diff --git a/client/.eslintrc.json b/client/.eslintrc.json index d2d227a66f..1581b3e209 100644 --- a/client/.eslintrc.json +++ b/client/.eslintrc.json @@ -91,6 +91,7 @@ "skipWords": [ "accessor", "allowfullscreen", + "amalthea", "amazonaws", "apiversion", "ascii", @@ -223,6 +224,7 @@ "presentational", "profiler", "progressbar", + "prometheus", "proxying", "Pupikofer", "pygments", diff --git a/client/src/components/prometheusModal/prometheusModal.tsx b/client/src/components/prometheusModal/prometheusModal.tsx new file mode 100644 index 0000000000..f96f94ffc0 --- /dev/null +++ b/client/src/components/prometheusModal/prometheusModal.tsx @@ -0,0 +1,392 @@ +/*! + * Copyright 2025 - Swiss Data Science Center (SDSC) + * A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and + * Eidgenössische Technische Hochschule Zürich (ETHZ). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* eslint-disable spellcheck/spell-checker */ +import cx from "classnames"; +import { useCallback, useState, useEffect, useRef, useMemo } from "react"; +import { Activity } from "react-bootstrap-icons"; +import { Card, CardBody, CloseButton } from "reactstrap"; + +interface PrometheusQueryResult { + status: string; + data: { + resultType: string; + result: Array<{ + metric: Record; + value?: [number, string]; + values?: Array<[number, string]>; + }>; + }; + requestId?: string; + error?: string; + predefinedQuery?: { + label: string; + query: string; + path?: string; + description?: string; + icon?: string; + unit: string; + }; +} + +interface PrometheusQueryBoxProps { + className?: string; + sessionName: string; + onClose: () => void; + setPrometheusQueryBtnColor: (color: string) => void; + showPrometheusQuery: boolean; +} + +interface AlertDetails { + alertName: string; + severity: string; + value: number; + description: string; + unit?: string; +} + +function usePrometheusWebSocket() { + const [ws, setWs] = useState(null); + const [isConnected, setIsConnected] = useState(false); + const reconnectAttempts = useRef(0); + const reconnectTimeout = useRef(null); + const maxReconnectAttempts = 10; + const baseDelay = 1000; + const maxDelay = 30000; + + const pendingRequests = useRef< + Map< + string, + { + resolve: (result: PrometheusQueryResult) => void; + reject: (error: Error) => void; + } + > + >(new Map()); + + const connect = useCallback(() => { + const wsUrl = `wss://${window.location.host}/ui-server/ws`; + + const websocket = new WebSocket(wsUrl); + + websocket.onopen = () => { + setIsConnected(true); + setWs(websocket); + reconnectAttempts.current = 0; + }; + + websocket.onmessage = (event) => { + const message = JSON.parse(event.data); + + if (message.type === "prometheusQuery" && message.data?.requestId) { + const pending = pendingRequests.current.get(message.data.requestId); + if (pending) { + pendingRequests.current.delete(message.data.requestId); 
+ if (message.data.error) { + pending.reject(new Error(message.data.error)); + } else { + pending.resolve(message.data); + } + } + } + }; + + websocket.onerror = () => { + setIsConnected(false); + }; + + websocket.onclose = (event) => { + setIsConnected(false); + setWs(null); + + if (pendingRequests.current.size > 0) { + pendingRequests.current.forEach((pending) => { + pending.reject(new Error("WebSocket connection closed")); + }); + pendingRequests.current.clear(); + } + + if (reconnectAttempts.current < maxReconnectAttempts && !event.wasClean) { + const delay = Math.min( + baseDelay * Math.pow(2, reconnectAttempts.current), + maxDelay + ); + + reconnectTimeout.current = setTimeout(() => { + reconnectAttempts.current++; + connect(); + }, delay); + } + }; + + return websocket; + }, []); + + useEffect(() => { + const websocket = connect(); + + return () => { + if (reconnectTimeout.current) { + clearTimeout(reconnectTimeout.current); + reconnectTimeout.current = null; + } + websocket.close(1000, "Component unmounting"); + }; + }, [connect]); + + const sendPrometheusQuery = useCallback( + async (queryOrPath: string): Promise => { + if (!ws || !isConnected) { + throw new Error("WebSocket not connected"); + } + + const requestId = `prometheus-${Date.now()}-${Math.random()}`; + + return new Promise((resolve, reject) => { + pendingRequests.current.set(requestId, { resolve, reject }); + + setTimeout(() => { + if (pendingRequests.current.has(requestId)) { + pendingRequests.current.delete(requestId); + reject(new Error("Request timeout")); + } + }, 10000); + + const message = { + timestamp: new Date(), + type: "prometheusQuery", + data: { + fullPath: queryOrPath, + requestId, + }, + }; + + try { + ws.send(JSON.stringify(message)); + } catch (error) { + pendingRequests.current.delete(requestId); + reject(error); + } + }); + }, + [ws, isConnected] + ); + + return { sendPrometheusQuery, isConnected }; +} + +export function PrometheusQueryBox({ + className, + sessionName, + 
onClose, + setPrometheusQueryBtnColor, + showPrometheusQuery, +}: PrometheusQueryBoxProps) { + const [queryResults, setQueryResults] = useState([]); + const [alerts, setAlerts] = useState([]); + + const { sendPrometheusQuery } = usePrometheusWebSocket(); + + const hardcodedQuery = useMemo(() => { + const query = `ALERTS{pod=~"${sessionName}.*", purpose="renku-session"}`; + return { + label: "Alerts for this session", + query, + path: `http://prometheus-server.monitoring.svc.cluster.local/api/v1/query?query=${encodeURIComponent( + query + )}`, + description: "Alerts for this session", + icon: "memory", + unit: "", + }; + }, [sessionName]); + + const setPrometheusQueryBtnColorRef = useRef(setPrometheusQueryBtnColor); + const sendPrometheusQueryRef = useRef(sendPrometheusQuery); + + useEffect(() => { + setPrometheusQueryBtnColorRef.current = setPrometheusQueryBtnColor; + }, [setPrometheusQueryBtnColor]); + + useEffect(() => { + sendPrometheusQueryRef.current = sendPrometheusQuery; + }, [sendPrometheusQuery]); + + const executeQuery = useCallback( + async (predefinedQuery: { + label: string; + query: string; + path: string; + description?: string; + icon?: string; + unit: string; + }) => { + if (!predefinedQuery.path?.trim()) return; + + try { + const result = await sendPrometheusQueryRef.current( + predefinedQuery.path + ); + return result; + } catch (err) { + return null; + } + }, + [] + ); + + const getAllQueryResults = useCallback(async () => { + const result = await executeQuery(hardcodedQuery); + + if (result?.data?.result?.length && result.data.result.length > 0) { + const filteredResults = [{ ...result, predefinedQuery: hardcodedQuery }]; + + setQueryResults(filteredResults); + } else { + setQueryResults([]); + } + }, [executeQuery, hardcodedQuery]); + + const getAlertDetails = useCallback(() => { + if (queryResults.length === 0) return []; + const result = queryResults[0]; + if (result.data.result.length === 0) return []; + + const alertNames = 
result.data.result.map( + (alertResult) => alertResult.metric.name + ); + return alertNames; + }, [queryResults]); + + const handleCloseButton = useCallback(() => { + onClose(); + }, [onClose]); + + useEffect(() => { + getAllQueryResults(); + + const interval = setInterval(() => { + getAllQueryResults(); + }, 15000); + + return () => { + clearInterval(interval); + }; + }, [getAllQueryResults]); + + useEffect(() => { + async function getAllAlertDetails(alertNames: string[]) { + if (!alertNames || alertNames.length === 0) { + setAlerts([]); + setPrometheusQueryBtnColorRef.current("text-dark"); + return; + } + + const query = "ALERTS"; + const detailsQuery = { + label: "Alerts for this session", + query, + path: `http://prometheus-server.monitoring.svc.cluster.local/api/v1/alerts`, + description: "Alerts for this session", + icon: "memory", + unit: "", + }; + + const result = await executeQuery(detailsQuery); + + if (result?.data?.result?.length && result.data.result.length > 0) { + const relevantAlerts = result.data.result.filter( + (alert) => + alert.metric.name && + alertNames.includes(alert.metric.name) && + alert.metric.purpose === "renku-session" + ); + + let buttonColor = "text-warning"; + + const alertDetails: AlertDetails[] = relevantAlerts.map((alert) => { + let severity = alert.metric.severity || "unknown"; + const alertValue = alert.value?.[1] || alert.values?.[0]?.[1] || "0"; + const value = parseFloat(alertValue) || 0; + + if (alert.metric.criticalAt) { + const criticalThreshold = parseFloat(alert.metric.criticalAt); + if (value >= criticalThreshold) { + severity = "critical"; + buttonColor = "text-danger"; + } + } + return { + alertName: + alert.metric.alertname || alert.metric.name || "Unknown Alert", + severity, + value, + description: alert.metric.description || "", + unit: alert.metric.unit || "", + }; + }); + setAlerts(alertDetails); + setPrometheusQueryBtnColorRef.current(buttonColor); + } else { + setAlerts([]); + } + } + + const alertNames = 
getAlertDetails(); + getAllAlertDetails(alertNames); + }, [queryResults, getAlertDetails, executeQuery]); + + if (queryResults.length === 0 || showPrometheusQuery === false) { + return null; + } + + return ( + + +
+
+ + Metrics +
+ + +
+ + {alerts.map((alert, idx) => ( +
+
+ {alert.description || alert.alertName} +
+
+ {alert.value} {alert.unit} +
+ {idx < alerts.length - 1 &&
} +
+ ))} +
+
+ ); +} diff --git a/client/src/features/sessionsV2/SessionShowPage/ShowSessionPage.tsx b/client/src/features/sessionsV2/SessionShowPage/ShowSessionPage.tsx index 5e6800b74a..8171f64bf6 100644 --- a/client/src/features/sessionsV2/SessionShowPage/ShowSessionPage.tsx +++ b/client/src/features/sessionsV2/SessionShowPage/ShowSessionPage.tsx @@ -20,6 +20,7 @@ import { skipToken } from "@reduxjs/toolkit/query"; import cx from "classnames"; import { useCallback, useEffect, useMemo, useRef, useState } from "react"; import { + Activity, ArrowLeft, Box, Briefcase, @@ -69,6 +70,7 @@ import SessionPaused from "./SessionPaused"; import SessionUnavailable from "./SessionUnavailable"; import styles from "../../session/components/ShowSession.module.scss"; +import { PrometheusQueryBox } from "../../../components/prometheusModal/prometheusModal"; export default function ShowSessionPage() { const dispatch = useAppDispatch(); @@ -113,7 +115,9 @@ export default function ShowSessionPage() { const toggleModalLogs = useCallback(() => { dispatch( - displaySlice.actions.toggleSessionLogsModal({ targetServer: sessionName }) + displaySlice.actions.toggleSessionLogsModal({ + targetServer: sessionName, + }) ); }, [dispatch, sessionName]); @@ -123,6 +127,14 @@ export default function ShowSessionPage() { () => setShowModalPauseOrDeleteSession((show) => !show), [] ); + + const [showPrometheusQuery, setShowPrometheusQuery] = useState(true); + const [prometheusQueryBtnColor, setPrometheusQueryBtnColor] = + useState("text-dark"); + const togglePrometheusQuery = useCallback( + () => setShowPrometheusQuery((show) => !show), + [] + ); const [pauseOrDeleteAction, setPauseOrDeleteAction] = useState< "pause" | "delete" >("pause"); @@ -225,6 +237,10 @@ export default function ShowSessionPage() { namespace={namespace} slug={slug} /> +
+
+ +
{content}
@@ -293,6 +320,39 @@ function LogsBtn({ toggle }: LogsBtnProps) { ); } +interface PrometheusBtnProps { + toggle: () => void; + color: string; +} +function PrometheusBtn({ toggle, color }: PrometheusBtnProps) { + const ref = useRef(null); + + return ( +
+ + + Toggle metrics + +
+ ); +} + interface PauseSessionBtnProps { openPauseSession: () => void; } diff --git a/client/src/root.tsx b/client/src/root.tsx index f605592557..df42cdbca2 100644 --- a/client/src/root.tsx +++ b/client/src/root.tsx @@ -46,7 +46,7 @@ export const DEFAULT_META: MetaDescriptor[] = [ { name: "description", content: - "An open-source platform for reproducible and collaborative data science. Share code, data and computational environments whilst tracking provenance and lineage of research objects.", + "An open-source platform for reproducible and collaborative data science. Share code, data and computational environments whilst tracking provenance and lineage of research objects.", }, { property: "og:title", diff --git a/server/src/config.ts b/server/src/config.ts index 9e03087eec..e1664d7f6e 100644 --- a/server/src/config.ts +++ b/server/src/config.ts @@ -92,6 +92,9 @@ const PROMETHEUS = { (process.env.PROMETHEUS_ENABLED ?? "").toLowerCase() ), path: "/metrics", + url: + process.env.PROMETHEUS_URL || + "http://prometheus-server.monitoring.svc.cluster.local", }; const config = { diff --git a/server/src/websocket/handlers/prometheus.ts b/server/src/websocket/handlers/prometheus.ts new file mode 100644 index 0000000000..a357052b6a --- /dev/null +++ b/server/src/websocket/handlers/prometheus.ts @@ -0,0 +1,109 @@ +/*! + * Copyright 2025 - Swiss Data Science Center (SDSC) + * A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and + * Eidgenössische Technische Hochschule Zürich (ETHZ). + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import fetch from "cross-fetch";
+import ws from "ws";
+
+import logger from "../../logger";
+import { WsMessage } from "../WsMessages";
+import type { Channel } from "./handlers.types";
+
+/**
+ * Base URL of the only Prometheus server this handler is allowed to reach.
+ * Uses the same environment variable and fallback as `PROMETHEUS.url` in
+ * `server/src/config.ts`.
+ * NOTE(review): read it from the shared config object instead once it is
+ * confirmed how `PROMETHEUS` is exposed there.
+ */
+const PROMETHEUS_BASE_URL =
+  process.env.PROMETHEUS_URL ||
+  "http://prometheus-server.monitoring.svc.cluster.local";
+
+/** Only the Prometheus HTTP API may be queried through the WebSocket. */
+const PROMETHEUS_API_PREFIX = "/api/v1/";
+
+/**
+ * Turn the client-supplied `fullPath` into a URL that is safe to fetch.
+ *
+ * The client may send an absolute URL or a bare path. Either way, only the
+ * path and query string are kept and they are re-anchored on
+ * `PROMETHEUS_BASE_URL`, so a client can never make the server issue requests
+ * to an arbitrary host (server-side request forgery).
+ *
+ * @param fullPath - URL or path received from the client.
+ * @returns The URL to fetch, or `null` when the input cannot be parsed or
+ *   does not point below `/api/v1/`.
+ */
+function resolvePrometheusUrl(fullPath: string): URL | null {
+  try {
+    // The URL parser normalizes dot segments, so "/api/v1/../x" is rejected
+    // by the prefix check below rather than slipping through.
+    const requested = new URL(fullPath, PROMETHEUS_BASE_URL);
+    if (!requested.pathname.startsWith(PROMETHEUS_API_PREFIX)) {
+      return null;
+    }
+    return new URL(
+      `${requested.pathname}${requested.search}`,
+      PROMETHEUS_BASE_URL
+    );
+  } catch (error) {
+    logger.error("Could not parse Prometheus path:", error);
+    return null;
+  }
+}
+
+/**
+ * Send a `prometheusQuery` message to the client.
+ *
+ * Never throws: the socket may have been closed while the Prometheus request
+ * was in flight, and a throwing `send` inside an un-awaited async function
+ * would surface as an unhandled promise rejection.
+ *
+ * @param socket - WebSocket to write to.
+ * @param payload - Message data; callers include `requestId` so the client
+ *   can match the answer to its request.
+ */
+function sendPrometheusMessage(
+  socket: ws,
+  payload: Record<string, unknown>
+): void {
+  if (socket.readyState !== socket.OPEN) {
+    return;
+  }
+  try {
+    socket.send(new WsMessage(payload, "user", "prometheusQuery").toString());
+  } catch (error) {
+    logger.error("Error sending Prometheus query response:", error);
+  }
+}
+
+/**
+ * WebSocket handler for `prometheusQuery` messages.
+ *
+ * Validates that `fullPath` is a non-empty string, then starts the query in
+ * the background; the outcome is delivered asynchronously over `socket`.
+ *
+ * @param data - Message payload; reads `fullPath` and `requestId`.
+ * @param channel - Channel of the requesting client (not used here).
+ * @param socket - WebSocket the response is written to.
+ */
+export function handlerPrometheusQuery(
+  data: Record<string, unknown>,
+  channel: Channel,
+  socket: ws
+): void {
+  const { fullPath, requestId } = data;
+
+  if (!fullPath || typeof fullPath !== "string") {
+    sendPrometheusMessage(socket, {
+      error: "Missing required 'fullPath' parameter",
+      requestId,
+    });
+    return;
+  }
+
+  // Fire-and-forget on purpose: every failure is reported through the socket.
+  void executePrometheusFullPath(fullPath, requestId, socket);
+}
+
+/**
+ * Run one Prometheus request and forward the JSON answer to the client.
+ *
+ * On success the Prometheus response body is spread into the message next to
+ * `requestId`; on any failure a message with an `error` field is sent.
+ *
+ * @param fullPath - Client-supplied URL or path; it is sanitized before use.
+ * @param requestId - Correlation id echoed back unchanged.
+ * @param socket - WebSocket the response is written to.
+ */
+async function executePrometheusFullPath(
+  fullPath: string,
+  requestId: unknown,
+  socket: ws
+): Promise<void> {
+  const target = resolvePrometheusUrl(fullPath);
+  if (target == null) {
+    logger.error(`Rejected Prometheus path: ${fullPath}`);
+    sendPrometheusMessage(socket, {
+      error: "Invalid 'fullPath' parameter",
+      requestId,
+    });
+    return;
+  }
+
+  try {
+    logger.info(`🔗 Making request to full path: ${target.toString()}`);
+    logger.info(`🆔 Request ID: ${String(requestId)}`);
+
+    const prometheusResponse = await fetch(target.toString(), {
+      method: "GET",
+      headers: {
+        Accept: "application/json",
+      },
+    });
+
+    logger.info(`📊 Response status: ${prometheusResponse.status}`);
+
+    if (!prometheusResponse.ok) {
+      logger.error(
+        `Prometheus query failed with status ${prometheusResponse.status}`
+      );
+      sendPrometheusMessage(socket, {
+        error: `Prometheus server error: ${prometheusResponse.statusText}`,
+        requestId,
+      });
+      return;
+    }
+
+    const responseData = await prometheusResponse.json();
+    logger.info(`📈 Response data:`, JSON.stringify(responseData, null, 2));
+
+    sendPrometheusMessage(socket, {
+      ...responseData,
+      requestId,
+    });
+  } catch (error) {
+    logger.error("Error executing Prometheus query:", error);
+
+    sendPrometheusMessage(socket, {
+      error: "Error executing Prometheus query",
+      // The catch variable is `unknown`; narrow before reading `.message`.
+      details: error instanceof Error ? error.message : String(error),
+      requestId,
+    });
+  }
+}
diff --git a/server/src/websocket/index.ts b/server/src/websocket/index.ts
index ec905afc48..875866801e 100644
--- a/server/src/websocket/index.ts
+++ b/server/src/websocket/index.ts
@@ -44,6 +44,7 @@ import {
   handlerRequestSessionStatusV2,
   heartbeatRequestSessionStatusV2,
 } from "./handlers/sessionsV2";
+import { handlerPrometheusQuery } from "./handlers/prometheus";
 
 // *** Channels ***
 // No need to store data in Redis since it's used only locally. We can modify this if necessary.
@@ -87,6 +88,13 @@ const acceptedMessages: Record> = {
       handler: handlerRequestSessionStatusV2,
     } as MessageData,
   ],
+  prometheusQuery: [
+    {
+      required: ["fullPath", "requestId"],
+      optional: null,
+      handler: handlerPrometheusQuery,
+    } as MessageData,
+  ],
   ping: [
     {
       required: null,