From 0abb3c3b27f2dee9b5c7b82dafc76aa6f79bb42f Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 14:56:51 +0200 Subject: [PATCH 01/10] fix(mcp): keep stateful upstream sessions alive (Playwright about:blank) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MCP forwarder spoke request/response only: it POSTed tools/call and read the immediate response, but never opened the standalone GET /mcp SSE stream nor answered server-initiated pings. Heartbeating MCP servers — Playwright MCP runs one (HTTP transport, runHeartbeat=true) — send the client a JSON-RPC ping every ~3s and call server.close() if no pong arrives within PLAYWRIGHT_MCP_PING_TIMEOUT_MS (default 5000). So every session was reaped ~5s after creation; the next tools/call got 404 "Session not found", the forwarder re-initialized, and the retry landed on a brand-new blank browser context — the agent saw about:blank mid-task (navigate/click state lost). Fix: act as a well-formed MCP client. For each stateful session, spawn a keepalive task that holds the standalone GET SSE stream open and replies pong to server pings, keeping the session — and the agent's live page — alive. The task is cancelled/replaced when the session is re-initialized. Also: - Tighten the session-loss classifier so it only triggers on genuine 4xx hard signals (never on healthy 2xx tool output that merely mentions "session", e.g. browser_evaluate returning sessionStorage), since a false positive is destructive for stateful servers. - Carry the triggering status+body on CallAttempt::SessionLost so the re-init log records exactly why a session was deemed lost. Tests: keepalive_holds_get_stream_and_pongs_server_ping plus the existing stateful/session-loss suite. 1048 router tests pass; clippy + fmt clean. Validated live on AKS: both sandbox routers hold a persistent GET stream to Playwright that survives well past the 5s reaper; zero session-loss. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- inference-router/src/mcp/forwarder.rs | 1086 ++++++++++++++++++++++++- 1 file changed, 1041 insertions(+), 45 deletions(-) diff --git a/inference-router/src/mcp/forwarder.rs b/inference-router/src/mcp/forwarder.rs index e329c305b..be929a3a3 100644 --- a/inference-router/src/mcp/forwarder.rs +++ b/inference-router/src/mcp/forwarder.rs @@ -67,6 +67,7 @@ use serde_json::{Value, json}; use std::collections::BTreeMap; use std::sync::Arc; use std::time::Duration; +use tokio::sync::Mutex; use super::registry::{DiscoveredMcpServer, McpServerRegistry}; use super::tools::{ @@ -92,6 +93,27 @@ struct ForwarderEntry { /// outbound `tools/list` and `tools/call` POST. Resolved at /// discovery time from the env var named by `meta.bearer_from_env`. bearer_token: Option, + /// Live MCP session for a STATEFUL upstream — the `Mcp-Session-Id` plus the + /// negotiated protocol version, established by the `initialize` handshake at + /// discovery and reused across every `tools/call` so servers that bind state + /// to the session (e.g. Playwright, whose browser context lives in the + /// session) keep that state between calls. Re-established on demand when an + /// upstream reports the session is gone (pod restart / TTL expiry). + session: Arc>, + /// Background keepalive task for a STATEFUL session (see + /// [`run_session_keepalive`]). MCP servers such as Playwright run a + /// server-side heartbeat: they send the client a JSON-RPC `ping` over the + /// standalone `GET /mcp` SSE stream and, if no response (`pong`) arrives + /// within a few seconds (Playwright: `PLAYWRIGHT_MCP_PING_TIMEOUT_MS`, + /// default 5000), they tear the session down. A forwarder that only issues + /// request/response `tools/call` POSTs never receives those pings, so every + /// session silently dies ~5s after creation — the next call then 404s with + /// "Session not found", forcing a re-init onto a brand-new (blank) browser + /// context. The keepalive task holds the GET stream open and answers pings, + /// keeping the session — and the agent's live browser page — alive. Holds + /// the task's [`AbortHandle`] so it can be cancelled when the session is + /// re-initialized (replaced) or the dispatcher is dropped. + keepalive: Arc>>, } /// Namespaced MCP forwarder — the production-mode `AsyncToolDispatcher` @@ -297,7 +319,15 @@ async fn build_entry_for( ); } - let upstream_tools = fetch_upstream_tools(http, &meta.url, bearer_token.as_deref()).await?; + // MCP streamable-HTTP servers may be STATEFUL: they require an + // `initialize` handshake and bind subsequent requests to the + // `Mcp-Session-Id` they issue. Establish that session up front + // (best-effort: stateless servers simply return no session id and the + // `tools/list` below proceeds unauthenticated as before). + let session = initialize_session(http, &meta.url, bearer_token.as_deref()).await; + + let upstream_tools = + fetch_upstream_tools(http, &meta.url, bearer_token.as_deref(), &session).await?; let filtered = filter_by_allowlist(&upstream_tools, &meta.allowed_tools); if filtered.is_empty() { @@ -320,6 +350,20 @@ async fn build_entry_for( }); } + // For a STATEFUL session, start the heartbeat-responder so the upstream + // doesn't reap the session (and the agent's live browser page) after its + // ping timeout. Stateless sessions (id: None) need no keepalive. + let keepalive: Arc>> = Arc::new(Mutex::new(None)); + if session.id.is_some() { + let handle = spawn_session_keepalive( + http.clone(), + meta.url.clone(), + session.clone(), + bearer_token.clone(), + ); + *keepalive.lock().await = Some(handle); + } + Ok(( ForwarderEntry { name: name.to_string(), @@ -327,12 +371,318 @@ async fn build_entry_for( upstream_url: meta.url.clone(), tools: tools_map, bearer_token, + session: Arc::new(Mutex::new(session)), + keepalive, }, namespaced_defs, )) } -/// POST a JSON-RPC `tools/list` to `url` and parse the result. No +/// The MCP protocol version the router advertises in its `initialize` +/// handshake. Servers negotiate to a version they support; this is the +/// version the router proposes and falls back to when the upstream doesn't +/// echo one back. +const MCP_PROTOCOL_VERSION: &str = "2025-06-18"; + +/// A negotiated MCP streamable-HTTP session. +/// +/// Two pieces of per-connection state survive the `initialize` handshake and +/// MUST be replayed on every subsequent request for broad server +/// compatibility: +/// +/// - **`id`** — the `Mcp-Session-Id` a STATEFUL server issues (Playwright, the +/// official TS-SDK servers, GitHub MCP, …). `None` for stateless servers, +/// which simply don't return the header. +/// - **`protocol_version`** — the version the server negotiated. Per MCP +/// 2025-06-18 the client MUST send it back in the `MCP-Protocol-Version` +/// header on every post-initialize HTTP request; SDK-based servers reject +/// requests that omit it with `400 Bad Request`. We default to the version +/// we proposed when the server doesn't echo one (older servers ignore the +/// header). +#[derive(Clone, Debug)] +struct McpSession { + id: Option, + protocol_version: String, +} + +impl McpSession { + /// A session for an upstream that did not complete a handshake (stateless, + /// or handshake unreachable): no session id, default protocol version. + fn stateless() -> Self { + Self { + id: None, + protocol_version: MCP_PROTOCOL_VERSION.to_string(), + } + } + + /// Apply this session's headers (`Mcp-Session-Id` when present, and always + /// `MCP-Protocol-Version`) to an outbound request builder. + fn apply(&self, mut req: reqwest::RequestBuilder) -> reqwest::RequestBuilder { + req = req.header("mcp-protocol-version", &self.protocol_version); + if let Some(sid) = self.id.as_deref() { + req = req.header("mcp-session-id", sid); + } + req + } +} + +/// MCP `initialize` handshake — negotiates the protocol version and (for +/// STATEFUL upstreams) establishes the `Mcp-Session-Id` reused across every +/// subsequent request. +/// +/// Best-effort by design: stateless servers (and servers that don't implement +/// the handshake) return no session header, in which case the returned session +/// has `id: None` and the caller proceeds without one — preserving the original +/// `tools/list` behaviour. Stateful servers (e.g. Playwright MCP, which returns +/// `400 "Server not initialized"` on a bare `tools/list`) issue a session id; +/// for those we also send the required `notifications/initialized` to complete +/// the handshake before any `tools/list` / `tools/call`. +/// +/// Never returns an error: a failed handshake degrades to a stateless session +/// and the real signal surfaces from the subsequent `tools/list` (which keeps +/// error reporting in one place). +async fn initialize_session(http: &reqwest::Client, url: &str, bearer: Option<&str>) -> McpSession { + let init_body = json!({ + "jsonrpc": "2.0", + "id": 1, + "method": "initialize", + "params": { + "protocolVersion": MCP_PROTOCOL_VERSION, + "capabilities": {}, + "clientInfo": { + "name": "kars-inference-router", + "version": env!("CARGO_PKG_VERSION"), + }, + }, + }); + + let mut req = http + .post(url) + .header("content-type", "application/json") + .header("accept", "application/json, text/event-stream") + .json(&init_body); + if let Some(token) = bearer { + req = req.bearer_auth(token); + } + + let resp = match req.send().await { + Ok(r) => r, + Err(e) => { + tracing::debug!(url = %url, error = %e, "MCP initialize POST failed; treating upstream as stateless"); + return McpSession::stateless(); + } + }; + + let session_id = resp + .headers() + .get("mcp-session-id") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + let content_type = resp + .headers() + .get("content-type") + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + + // The initialize *result* carries the negotiated `protocolVersion`. The + // body may be plain JSON or an SSE event stream (official TS-SDK servers + // reply with SSE); `extract_jsonrpc_payload` handles both. If anything is + // missing we fall back to the version we proposed. + let protocol_version = match resp.text().await { + Ok(body) => extract_jsonrpc_payload(&content_type, &body) + .ok() + .and_then(|v| { + v.get("result") + .and_then(|r| r.get("protocolVersion")) + .and_then(|p| p.as_str()) + .map(|s| s.to_string()) + }) + .unwrap_or_else(|| MCP_PROTOCOL_VERSION.to_string()), + Err(_) => MCP_PROTOCOL_VERSION.to_string(), + }; + + let Some(sid) = session_id else { + // Stateless server: no session id, but still report the negotiated + // protocol version so we send the `MCP-Protocol-Version` header. + return McpSession { + id: None, + protocol_version, + }; + }; + + // Stateful server: complete the handshake. Per the MCP spec the client + // MUST send `notifications/initialized` (carrying the session + protocol + // headers) before issuing requests. Best-effort: a transient hiccup here + // surfaces as a clear error on the following `tools/list`. + let session = McpSession { + id: Some(sid), + protocol_version, + }; + let note = json!({ "jsonrpc": "2.0", "method": "notifications/initialized" }); + let mut nreq = http + .post(url) + .header("content-type", "application/json") + .header("accept", "application/json, text/event-stream") + .json(¬e); + nreq = session.apply(nreq); + if let Some(token) = bearer { + nreq = nreq.bearer_auth(token); + } + if let Err(e) = nreq.send().await { + tracing::debug!(url = %url, error = %e, "MCP notifications/initialized POST failed"); + } + + tracing::debug!(url = %url, protocol = %session.protocol_version, "Established MCP session with stateful upstream"); + session +} + +/// How long the keepalive holds a single `GET /mcp` SSE connection before +/// proactively reconnecting. The connection normally stays open for the life +/// of the session; this bound just guarantees we recycle a wedged TCP +/// connection rather than block forever on a half-open socket. +const KEEPALIVE_GET_TIMEOUT: Duration = Duration::from_secs(3600); + +/// Backoff before re-opening the standalone SSE stream after it ends. Short +/// enough that a transient blip doesn't leave a ping unanswered past the +/// upstream's timeout, long enough to avoid a hot loop if the server has +/// actually torn the session down (in which case the next GET 404s and the +/// task exits). +const KEEPALIVE_RECONNECT_BACKOFF: Duration = Duration::from_millis(750); + +/// Spawn the [`run_session_keepalive`] task and return its [`AbortHandle`]. +fn spawn_session_keepalive( + http: reqwest::Client, + url: String, + session: McpSession, + bearer: Option, +) -> tokio::task::AbortHandle { + tokio::spawn(run_session_keepalive(http, url, session, bearer)).abort_handle() +} + +/// Keep a STATEFUL MCP session alive by behaving as a well-formed MCP client: +/// hold the standalone `GET /mcp` SSE stream open and answer the server's +/// JSON-RPC `ping` requests with a `pong`. +/// +/// # Why this is required +/// +/// MCP servers may run a server-initiated heartbeat to detect dead clients. +/// Playwright MCP does (HTTP transport, `runHeartbeat = true`): every ~3s it +/// calls `server.ping()` and, if no response arrives within +/// `PLAYWRIGHT_MCP_PING_TIMEOUT_MS` (default 5000), it calls `server.close()`, +/// which deletes the session. A forwarder that only POSTs `tools/call` and +/// reads the immediate response never sees those pings, so the session — and +/// the browser context bound to it — is reaped ~5s after creation. The next +/// `tools/call` then gets `404 "Session not found"`, the forwarder re-inits, +/// and the retry lands on a fresh blank page (`about:blank`). Holding the GET +/// stream and ponging keeps the session (and the agent's page) alive. +/// +/// # Behaviour +/// +/// Loops for the life of the task (cancelled via its [`AbortHandle`] when the +/// session is re-initialized or the dispatcher is dropped): +/// +/// 1. Open `GET {url}` with `Accept: text/event-stream` + the session headers. +/// 2. A non-2xx (e.g. `404 Session not found`, or `405` from a server that +/// doesn't implement the standalone stream) means there's nothing to keep +/// alive this way — exit; the normal `tools/call` path handles re-init. +/// 3. Stream the SSE body. For each server→client JSON-RPC `ping`, POST a +/// `{"result":{}}` response carrying the same id and the session headers. +/// 4. If the stream ends while the session may still be valid, back off briefly +/// and reconnect. +async fn run_session_keepalive( + http: reqwest::Client, + url: String, + session: McpSession, + bearer: Option, +) { + use futures::StreamExt; + + loop { + let mut req = http + .get(&url) + .header("accept", "text/event-stream") + .header("mcp-protocol-version", &session.protocol_version) + .timeout(KEEPALIVE_GET_TIMEOUT); + if let Some(sid) = session.id.as_deref() { + req = req.header("mcp-session-id", sid); + } + if let Some(token) = bearer.as_deref() { + req = req.bearer_auth(token); + } + + let resp = match req.send().await { + Ok(r) => r, + Err(e) => { + tracing::debug!(url = %url, error = %e, "MCP keepalive GET failed; retrying"); + tokio::time::sleep(KEEPALIVE_RECONNECT_BACKOFF).await; + continue; + } + }; + + if !resp.status().is_success() { + // 404 → session already gone; 405 → server has no standalone SSE + // stream (so it can't be heartbeating us either). Either way, stop: + // there is nothing this task can keep alive. + tracing::debug!( + url = %url, + status = %resp.status(), + "MCP keepalive GET not available; stopping keepalive for this session" + ); + return; + } + + let mut stream = resp.bytes_stream(); + let mut buf = String::new(); + while let Some(chunk) = stream.next().await { + let bytes = match chunk { + Ok(b) => b, + Err(e) => { + tracing::debug!(url = %url, error = %e, "MCP keepalive SSE read error; reconnecting"); + break; + } + }; + buf.push_str(&String::from_utf8_lossy(&bytes)); + // SSE frames are newline-delimited; pull complete lines and act on + // `data:` lines that carry a server→client `ping` request. + while let Some(nl) = buf.find('\n') { + let line = buf[..nl].trim().to_string(); + buf.drain(..=nl); + let Some(payload) = line.strip_prefix("data:") else { + continue; + }; + let Ok(msg) = serde_json::from_str::(payload.trim()) else { + continue; + }; + if msg.get("method").and_then(|m| m.as_str()) != Some("ping") { + continue; + } + let Some(id) = msg.get("id").cloned() else { + // A ping *notification* (no id) needs no response. + continue; + }; + let pong = json!({ "jsonrpc": "2.0", "id": id, "result": {} }); + let mut preq = http + .post(&url) + .header("content-type", "application/json") + .header("accept", "application/json, text/event-stream") + .json(&pong); + preq = session.apply(preq); + if let Some(token) = bearer.as_deref() { + preq = preq.bearer_auth(token); + } + if let Err(e) = preq.send().await { + tracing::debug!(url = %url, error = %e, "MCP keepalive pong POST failed"); + } + } + } + + // Stream ended. Back off, then reconnect — if the session was reaped the + // next GET 404s and we exit; otherwise we resume answering pings. + tokio::time::sleep(KEEPALIVE_RECONNECT_BACKOFF).await; + } +} + /// pagination — Slice 4d.4 caps at one page; multi-page upstreams are /// truncated with a recorded warning. Multi-page support lands when /// we have a real consumer that hits the cap (principles §5). @@ -340,6 +690,7 @@ async fn fetch_upstream_tools( http: &reqwest::Client, url: &str, bearer: Option<&str>, + session: &McpSession, ) -> Result, String> { let body = json!({ "jsonrpc": "2.0", @@ -355,6 +706,7 @@ async fn fetch_upstream_tools( if let Some(token) = bearer { req = req.bearer_auth(token); } + req = session.apply(req); let resp = req .send() .await @@ -421,12 +773,82 @@ fn filter_by_allowlist(upstream: &[ToolDefinition], allow: &[String]) -> Vec bool { + let code = status.as_u16(); + if code != 400 && code != 404 { + return false; + } + body_signals_session_loss(body) +} + +/// Shared session-loss body classifier for both the HTTP-status path and +/// the JSON-RPC-error path. Conservative by design (see [`is_session_lost`]). +fn body_signals_session_loss(body: &str) -> bool { + let b = body.to_ascii_lowercase(); + // Canonical MCP SDK signal — emitted verbatim by Playwright MCP and the + // official reference servers when the session they issued is gone. + if b.contains("not initialized") { + return true; + } + // Other servers phrase it as an expired/invalid/missing *session*. Require + // the word "session" AND an explicit invalidation qualifier so incidental + // occurrences of "session" in tool output never trigger a re-init. + if b.contains("session") { + const QUALIFIERS: [&str; 6] = [ + "expired", + "not found", + "invalid", + "unknown", + "missing", + "no longer", + ]; + return QUALIFIERS.iter().any(|q| b.contains(q)); + } + false +} + +async fn post_tools_call( http: &reqwest::Client, entry: &ForwarderEntry, upstream_name: &str, arguments: &Value, -) -> Result { + session: &McpSession, +) -> CallAttempt { + let tool_label = format!("{}.{}", entry.prefix, upstream_name); let body = json!({ "jsonrpc": "2.0", "id": 1, @@ -445,13 +867,17 @@ async fn forward_tools_call( if let Some(token) = entry.bearer_token.as_deref() { req = req.bearer_auth(token); } - let resp = req - .send() - .await - .map_err(|e| DispatchError::ExecutionFailed { - tool: format!("{}.{}", entry.prefix, upstream_name), - reason: format!("upstream POST failed: {e}"), - })?; + req = session.apply(req); + + let resp = match req.send().await { + Ok(r) => r, + Err(e) => { + return CallAttempt::Fatal(DispatchError::ExecutionFailed { + tool: tool_label, + reason: format!("upstream POST failed: {e}"), + }); + } + }; let status = resp.status(); let content_type = resp @@ -460,17 +886,28 @@ async fn forward_tools_call( .and_then(|v| v.to_str().ok()) .unwrap_or("") .to_string(); - let body_text = resp - .text() - .await - .map_err(|e| DispatchError::ExecutionFailed { - tool: format!("{}.{}", entry.prefix, upstream_name), - reason: format!("upstream body read failed: {e}"), - })?; + let body_text = match resp.text().await { + Ok(t) => t, + Err(e) => { + return CallAttempt::Fatal(DispatchError::ExecutionFailed { + tool: tool_label, + reason: format!("upstream body read failed: {e}"), + }); + } + }; if !status.is_success() { - return Err(DispatchError::ExecutionFailed { - tool: format!("{}.{}", entry.prefix, upstream_name), + if is_session_lost(status, &body_text) { + return CallAttempt::SessionLost { + reason: format!( + "http {} body: {}", + status, + body_text.chars().take(160).collect::() + ), + }; + } + return CallAttempt::Fatal(DispatchError::ExecutionFailed { + tool: tool_label, reason: format!( "upstream non-2xx: {} (body trimmed: {})", status, @@ -479,26 +916,43 @@ async fn forward_tools_call( }); } - let json_payload = extract_jsonrpc_payload(&content_type, &body_text).map_err(|e| { - DispatchError::ExecutionFailed { - tool: format!("{}.{}", entry.prefix, upstream_name), - reason: format!("upstream response decode failed: {e}"), + let json_payload = match extract_jsonrpc_payload(&content_type, &body_text) { + Ok(v) => v, + Err(e) => { + return CallAttempt::Fatal(DispatchError::ExecutionFailed { + tool: tool_label, + reason: format!("upstream response decode failed: {e}"), + }); + } + }; + let parsed: ToolsCallResponse = match serde_json::from_value(json_payload) { + Ok(p) => p, + Err(e) => { + return CallAttempt::Fatal(DispatchError::ExecutionFailed { + tool: tool_label, + reason: format!("upstream tools/call parse failed: {e}"), + }); } - })?; - let parsed: ToolsCallResponse = - serde_json::from_value(json_payload).map_err(|e| DispatchError::ExecutionFailed { - tool: format!("{}.{}", entry.prefix, upstream_name), - reason: format!("upstream tools/call parse failed: {e}"), - })?; + }; if let Some(err) = parsed.error { - // Upstream protocol error → surface as an isError content - // entry, not a DispatchError. Per MCP spec, JSON-RPC errors - // from `tools/call` indicate the *protocol* failed; the - // semantic "tool ran but errored" path uses isError:true. - // We collapse them here because the agent-facing surface - // shouldn't distinguish (the audit layer can see both). - return Ok(ToolCallOutput { + // A JSON-RPC session error (code -32000/-32001 with an explicit + // session-loss message) also means the session is gone — retry once. + // We reuse the same conservative body classifier so an ordinary + // tool-level error whose message merely mentions "session" is NOT + // mistaken for a lost session. + if (err.code == -32000 || err.code == -32001) && body_signals_session_loss(&err.message) { + return CallAttempt::SessionLost { + reason: format!("jsonrpc error code={} message={}", err.code, err.message), + }; + } + // Other upstream protocol error → surface as an isError content + // entry, not a DispatchError. Per MCP spec, JSON-RPC errors from + // `tools/call` indicate the *protocol* failed; the semantic "tool + // ran but errored" path uses isError:true. We collapse them here + // because the agent-facing surface shouldn't distinguish (the audit + // layer can see both). + return CallAttempt::Done(ToolCallOutput { content: vec![ToolContent::Text { text: format!( "upstream JSON-RPC error code={} message={}", @@ -509,19 +963,78 @@ async fn forward_tools_call( }); } - let result = parsed - .result - .ok_or_else(|| DispatchError::ExecutionFailed { - tool: format!("{}.{}", entry.prefix, upstream_name), - reason: "tools/call missing result".to_string(), - })?; + let result = match parsed.result { + Some(r) => r, + None => { + return CallAttempt::Fatal(DispatchError::ExecutionFailed { + tool: tool_label, + reason: "tools/call missing result".to_string(), + }); + } + }; - Ok(ToolCallOutput { + CallAttempt::Done(ToolCallOutput { content: result.content, is_error: result.is_error.unwrap_or(false), }) } +async fn forward_tools_call( + http: &reqwest::Client, + entry: &ForwarderEntry, + upstream_name: &str, + arguments: &Value, +) -> Result { + // Reuse the session established at discovery so stateful upstreams keep + // their per-session state (e.g. the open browser page) across calls. + let session = entry.session.lock().await.clone(); + + match post_tools_call(http, entry, upstream_name, arguments, &session).await { + CallAttempt::Done(out) => Ok(out), + CallAttempt::Fatal(e) => Err(e), + CallAttempt::SessionLost { reason } => { + // Re-establish the session once and retry. Covers upstream pod + // restarts and session TTL expiry without failing the agent's call. + tracing::info!( + tool = %format!("{}.{}", entry.prefix, upstream_name), + reason = %reason, + "MCP upstream session lost; re-initializing and retrying once" + ); + let new_session = + initialize_session(http, &entry.upstream_url, entry.bearer_token.as_deref()).await; + *entry.session.lock().await = new_session.clone(); + + // The old keepalive (if any) was bound to the now-dead session; + // cancel it and start a fresh one for the re-established session so + // the new session is likewise protected from the upstream's + // heartbeat reaper. + { + let mut guard = entry.keepalive.lock().await; + if let Some(old) = guard.take() { + old.abort(); + } + if new_session.id.is_some() { + *guard = Some(spawn_session_keepalive( + http.clone(), + entry.upstream_url.clone(), + new_session.clone(), + entry.bearer_token.clone(), + )); + } + } + + match post_tools_call(http, entry, upstream_name, arguments, &new_session).await { + CallAttempt::Done(out) => Ok(out), + CallAttempt::Fatal(e) => Err(e), + CallAttempt::SessionLost { .. } => Err(DispatchError::ExecutionFailed { + tool: format!("{}.{}", entry.prefix, upstream_name), + reason: "upstream session could not be re-established after retry".to_string(), + }), + } + } + } +} + /// Decode an MCP Streamable HTTP response body into a JSON-RPC payload. /// /// Per the MCP Streamable HTTP transport spec, a server MAY respond @@ -633,7 +1146,7 @@ mod tests { Json, Router, extract::State, http::{HeaderMap, StatusCode}, - routing::{any, post}, + routing::{any, get, post}, }; use std::collections::BTreeMap; use std::sync::Arc as StdArc; @@ -1151,4 +1664,487 @@ mod tests { let ToolContent::Text { text: t2 } = &out2.content[0]; assert!(t2.contains("called query")); } + + /// A configurable STATEFUL mock upstream covering the transport variants of + /// the popular MCP servers: + /// + /// - **Playwright MCP** — JSON `initialize` body, `Mcp-Session-Id`, rejects + /// bare `tools/list` with `400 "Server not initialized"`. + /// - **Official TS-SDK servers / GitHub MCP** — SSE `initialize` body, a + /// negotiated `protocolVersion`, and a hard requirement that every + /// post-initialize request carries the `MCP-Protocol-Version` header + /// (returns `400` otherwise). + /// - **Session expiry / pod restart** — one request fails with a session + /// error, forcing a re-initialize + retry. + #[derive(Clone)] + struct StatefulState { + tools: Vec, + session_id: String, + negotiated_version: String, + /// Return the `initialize` result as a `text/event-stream` body. + sse_init: bool, + /// Reject `tools/*` requests that omit the `MCP-Protocol-Version` header. + require_protocol_header: bool, + init_count: StdArc, + call_count: StdArc, + /// When set, the next `tools/call` returns a `400` session error once + /// (then clears the flag), simulating session expiry / pod restart. + fail_next_with_session_lost: StdArc, + /// When set, every successful `tools/call` result embeds the word + /// "session" in its text (mimics Playwright `browser_evaluate` output + /// that references `sessionStorage`). A healthy 200 like this must + /// NEVER be misread as a lost session. + result_mentions_session: bool, + } + + impl StatefulState { + fn new(session_id: &str, tools: Vec) -> Self { + Self { + tools, + session_id: session_id.to_string(), + negotiated_version: MCP_PROTOCOL_VERSION.to_string(), + sse_init: false, + require_protocol_header: false, + init_count: StdArc::new(AtomicUsize::new(0)), + call_count: StdArc::new(AtomicUsize::new(0)), + fail_next_with_session_lost: StdArc::new(std::sync::atomic::AtomicBool::new(false)), + result_mentions_session: false, + } + } + } + + async fn stateful_mock_handler( + State(state): State, + headers: HeaderMap, + Json(body): Json, + ) -> axum::response::Response { + use axum::response::IntoResponse; + let method = body.get("method").and_then(|v| v.as_str()).unwrap_or(""); + let id = body.get("id").cloned().unwrap_or(serde_json::json!(1)); + let sid = headers + .get("mcp-session-id") + .and_then(|v| v.to_str().ok()) + .unwrap_or(""); + let has_proto = headers.contains_key("mcp-protocol-version"); + + let session_lost = || { + ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ + "jsonrpc": "2.0", "id": null, + "error": {"code": -32000, "message": "Bad Request: Server not initialized"} + })), + ) + .into_response() + }; + + match method { + "initialize" => { + state.init_count.fetch_add(1, Ordering::SeqCst); + let result = serde_json::json!({ + "jsonrpc": "2.0", "id": id, + "result": {"protocolVersion": state.negotiated_version, "capabilities": {}} + }); + if state.sse_init { + let sse = format!("event: message\ndata: {result}\n\n"); + ( + StatusCode::OK, + [ + ("mcp-session-id", state.session_id.clone()), + ("content-type", "text/event-stream".to_string()), + ], + sse, + ) + .into_response() + } else { + ( + StatusCode::OK, + [("mcp-session-id", state.session_id.clone())], + Json(result), + ) + .into_response() + } + } + "notifications/initialized" => StatusCode::ACCEPTED.into_response(), + "tools/list" | "tools/call" if sid != state.session_id => session_lost(), + "tools/list" | "tools/call" if state.require_protocol_header && !has_proto => ( + StatusCode::BAD_REQUEST, + Json(serde_json::json!({ + "jsonrpc": "2.0", "id": null, + "error": {"code": -32600, "message": "Missing MCP-Protocol-Version header"} + })), + ) + .into_response(), + "tools/list" => ( + StatusCode::OK, + Json(serde_json::json!({ + "jsonrpc": "2.0", "id": id, "result": {"tools": state.tools} + })), + ) + .into_response(), + "tools/call" => { + state.call_count.fetch_add(1, Ordering::SeqCst); + if state + .fail_next_with_session_lost + .swap(false, Ordering::SeqCst) + { + return session_lost(); + } + let tool = body + .pointer("/params/name") + .and_then(|v| v.as_str()) + .unwrap_or(""); + let text = if state.result_mentions_session { + format!("called {tool}; page has sessionStorage keys: [token]") + } else { + format!("called {tool}") + }; + ( + StatusCode::OK, + Json(serde_json::json!({ + "jsonrpc": "2.0", "id": id, + "result": {"content": [{"type": "text", "text": text}], "isError": false} + })), + ) + .into_response() + } + _ => ( + StatusCode::OK, + Json(serde_json::json!({ + "jsonrpc": "2.0", "id": id, + "error": {"code": -32601, "message": "method not found"} + })), + ) + .into_response(), + } + } + + async fn stateful_mock_upstream(state: StatefulState) -> String { + let app = Router::new() + .route("/", post(stateful_mock_handler)) + .route("/", any(method_block)) + .with_state(state); + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + format!("http://{addr}/") + } + + /// Playwright-MCP shape: JSON `initialize`, session id, bare `tools/list` + /// rejected. Discovery + call must succeed and reuse one session. + #[tokio::test] + async fn stateful_playwright_shape_discovers_and_calls_with_reused_session() { + let state = StatefulState::new( + "sess-abc-123", + vec![tool_def("browser_navigate", "open a url")], + ); + let init_count = state.init_count.clone(); + let url = stateful_mock_upstream(state).await; + let registry = registry_with(vec![discovered("playwright", &url, vec!["*"])]); + + let dispatcher = RouterToolDispatcher::discover(registry, Duration::from_secs(5)) + .await + .expect("discover"); + assert_eq!( + dispatcher.len(), + 1, + "stateful server tools must be discovered" + ); + assert!( + dispatcher.skipped().is_empty(), + "stateful server must not be skipped" + ); + + let out = dispatcher + .invoke( + "playwright.browser_navigate", + &serde_json::json!({"url": "https://x"}), + ) + .await + .expect("invoke"); + let ToolContent::Text { text } = &out.content[0]; + assert!(text.contains("called browser_navigate")); + assert!(!out.is_error); + + // Exactly one initialize: the session is reused, not re-created per call. + assert_eq!( + init_count.load(Ordering::SeqCst), + 1, + "session must be reused across calls" + ); + } + + /// Official TS-SDK / GitHub-MCP shape: SSE `initialize` body, a negotiated + /// protocol version, and a hard requirement that the `MCP-Protocol-Version` + /// header is replayed on every request. Discovery succeeding proves the + /// router parses the SSE-negotiated version and sends the header. + #[tokio::test] + async fn stateful_sse_init_with_required_protocol_header() { + let mut state = StatefulState::new("gh-session-xyz", vec![tool_def("search_repos", "")]); + state.sse_init = true; + state.require_protocol_header = true; + state.negotiated_version = "2025-03-26".to_string(); + let url = stateful_mock_upstream(state).await; + let registry = registry_with(vec![discovered("github-mcp", &url, vec!["*"])]); + + let dispatcher = RouterToolDispatcher::discover(registry, Duration::from_secs(5)) + .await + .expect("discover"); + assert_eq!( + dispatcher.len(), + 1, + "SSE-init server with required protocol header must be discovered" + ); + assert!(dispatcher.skipped().is_empty()); + + let out = dispatcher + .invoke("github_mcp.search_repos", &serde_json::json!({"q": "kars"})) + .await + .expect("invoke"); + let ToolContent::Text { text } = &out.content[0]; + assert!(text.contains("called search_repos")); + } + + /// Session expiry / upstream pod restart: the first `tools/call` fails with + /// a session error; the router must re-initialize and retry transparently. + #[tokio::test] + async fn stateful_session_expiry_triggers_reinit_and_retry() { + let state = StatefulState::new("sess-1", vec![tool_def("do_thing", "")]); + let init_count = state.init_count.clone(); + let fail_flag = state.fail_next_with_session_lost.clone(); + let url = stateful_mock_upstream(state).await; + let registry = registry_with(vec![discovered("svc", &url, vec!["*"])]); + + let dispatcher = RouterToolDispatcher::discover(registry, Duration::from_secs(5)) + .await + .expect("discover"); + assert_eq!( + init_count.load(Ordering::SeqCst), + 1, + "one initialize at discovery" + ); + + // Arm the upstream to drop the session on the next call. + fail_flag.store(true, Ordering::SeqCst); + let out = dispatcher + .invoke("svc.do_thing", &serde_json::json!({})) + .await + .expect("invoke should succeed after transparent re-init + retry"); + let ToolContent::Text { text } = &out.content[0]; + assert!(text.contains("called do_thing")); + + // The retry path re-established the session exactly once more. + assert_eq!( + init_count.load(Ordering::SeqCst), + 2, + "session must be re-initialized once on expiry" + ); + } + + /// Regression: a HEALTHY `tools/call` whose 200 result merely mentions the + /// word "session" (e.g. Playwright `browser_evaluate` returning page text + /// that references `sessionStorage`) must NOT be misread as a lost session. + /// Re-initializing here is destructive — it drops the live browser context + /// and the retry lands on a blank `about:blank` page. The session must be + /// reused with ZERO re-initializations across many such calls. + #[tokio::test] + async fn healthy_call_mentioning_session_does_not_reinitialize() { + let mut state = + StatefulState::new("sess-keep", vec![tool_def("browser_evaluate", "run js")]); + state.result_mentions_session = true; + let init_count = state.init_count.clone(); + let url = stateful_mock_upstream(state).await; + let registry = registry_with(vec![discovered("pw", &url, vec!["*"])]); + + let dispatcher = RouterToolDispatcher::discover(registry, Duration::from_secs(5)) + .await + .expect("discover"); + assert_eq!( + init_count.load(Ordering::SeqCst), + 1, + "one init at discovery" + ); + + // Several calls in a row, each returning "session" in the result text. + for _ in 0..5 { + let out = dispatcher + .invoke( + "pw.browser_evaluate", + &serde_json::json!({"function": "() => 1"}), + ) + .await + .expect("invoke"); + let ToolContent::Text { text } = &out.content[0]; + assert!(text.contains("sessionStorage"), "result text preserved"); + assert!(!out.is_error); + } + + // The single discovery session must have been reused for every call — + // no false-positive session-loss re-init. + assert_eq!( + init_count.load(Ordering::SeqCst), + 1, + "healthy 'session'-mentioning results must not trigger re-initialization" + ); + } + + #[test] + fn session_loss_classifier_is_conservative() { + use reqwest::StatusCode; + + // Canonical MCP SDK signal — Playwright / official servers. + assert!(is_session_lost( + StatusCode::BAD_REQUEST, + "Bad Request: Server not initialized" + )); + // Explicit session invalidation phrasings. + assert!(is_session_lost(StatusCode::NOT_FOUND, "session not found")); + assert!(is_session_lost( + StatusCode::BAD_REQUEST, + "Mcp-Session-Id expired" + )); + assert!(is_session_lost( + StatusCode::BAD_REQUEST, + "this session is no longer valid" + )); + + // Incidental "session" mentions in error bodies must NOT match — this + // is exactly the false positive that corrupted live browser sessions. + assert!(!is_session_lost( + StatusCode::BAD_REQUEST, + "Error: page.evaluate failed; sessionStorage is empty" + )); + assert!(!is_session_lost( + StatusCode::BAD_REQUEST, + "invalid arguments: missing 'url'" + )); + // Non-4xx never qualifies regardless of body. + assert!(!is_session_lost(StatusCode::OK, "Server not initialized")); + assert!(!is_session_lost( + StatusCode::INTERNAL_SERVER_ERROR, + "session expired" + )); + } + + /// Mock upstream that mimics a heartbeating MCP server (Playwright shape): + /// it issues a session on `initialize`, serves a `tools/list`, and — on the + /// standalone `GET /` SSE stream — pushes a server→client `ping` then holds + /// the stream open. A correct client MUST answer that ping with a `pong` + /// POST or the real server would reap the session after its ping timeout. + #[derive(Clone)] + struct HeartbeatState { + session_id: String, + tools: Vec, + get_hits: StdArc, + pong_hits: StdArc, + } + + async fn heartbeat_post( + State(state): State, + Json(body): Json, + ) -> axum::response::Response { + use axum::response::IntoResponse; + let method = body.get("method").and_then(|v| v.as_str()).unwrap_or(""); + let id = body.get("id").cloned().unwrap_or(serde_json::json!(1)); + match method { + "initialize" => ( + StatusCode::OK, + [("mcp-session-id", state.session_id.clone())], + Json(serde_json::json!({ + "jsonrpc": "2.0", "id": id, + "result": {"protocolVersion": MCP_PROTOCOL_VERSION, "capabilities": {}} + })), + ) + .into_response(), + "notifications/initialized" => StatusCode::ACCEPTED.into_response(), + "tools/list" => ( + StatusCode::OK, + Json(serde_json::json!({ + "jsonrpc": "2.0", "id": id, "result": {"tools": state.tools} + })), + ) + .into_response(), + // A `pong` is a JSON-RPC *response*: it has `result` + `id` and no + // `method`. That's the heartbeat reply we're asserting on. + "" if body.get("result").is_some() && body.get("id").is_some() => { + state.pong_hits.fetch_add(1, Ordering::SeqCst); + StatusCode::ACCEPTED.into_response() + } + _ => StatusCode::ACCEPTED.into_response(), + } + } + + async fn heartbeat_get(State(state): State) -> axum::response::Response { + use axum::response::IntoResponse; + use futures::stream::{self, StreamExt}; + state.get_hits.fetch_add(1, Ordering::SeqCst); + // Push one server→client ping, then hold the stream open forever. + let ping = "event: message\ndata: {\"jsonrpc\":\"2.0\",\"id\":0,\"method\":\"ping\"}\n\n"; + let body = axum::body::Body::from_stream( + stream::once(async move { + Ok::<_, std::io::Error>(axum::body::Bytes::from_static(ping.as_bytes())) + }) + .chain(stream::pending()), + ); + ( + StatusCode::OK, + [("content-type", "text/event-stream")], + body, + ) + .into_response() + } + + /// A heartbeating upstream must not reap our session: the forwarder has to + /// hold the standalone `GET` SSE stream open and answer server pings with a + /// `pong`. This is the fix for the Playwright `about:blank` regression + /// (session reaped after the 5s ping timeout → re-init onto a blank page). + #[tokio::test] + async fn keepalive_holds_get_stream_and_pongs_server_ping() { + let state = HeartbeatState { + session_id: "hb-session-1".to_string(), + tools: vec![tool_def("browser_navigate", "go")], + get_hits: StdArc::new(AtomicUsize::new(0)), + pong_hits: StdArc::new(AtomicUsize::new(0)), + }; + let get_hits = state.get_hits.clone(); + let pong_hits = state.pong_hits.clone(); + + let app = Router::new() + .route("/", post(heartbeat_post)) + .route("/", get(heartbeat_get)) + .with_state(state); + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + let url = format!("http://{addr}/"); + + let registry = registry_with(vec![discovered("browser", &url, vec!["*"])]); + let dispatcher = + RouterToolDispatcher::discover_with_client(registry, reqwest::Client::new()) + .await + .unwrap(); + assert_eq!(dispatcher.len(), 1, "stateful server should be discovered"); + + // The keepalive runs in the background; give it a moment to open the + // GET stream, receive the ping, and POST the pong. + for _ in 0..50 { + if pong_hits.load(Ordering::SeqCst) >= 1 { + break; + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + + assert!( + get_hits.load(Ordering::SeqCst) >= 1, + "keepalive must open the standalone GET SSE stream" + ); + assert!( + pong_hits.load(Ordering::SeqCst) >= 1, + "keepalive must answer the server's ping with a pong (else the upstream reaps the session)" + ); + } } From eb1a7fd0d43cb68b614ec7cdd27f31d7a8ba2003 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:28:19 +0200 Subject: [PATCH 02/10] fix(deps): bump anyhow to 1.0.103 (RUSTSEC-2026-0190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cargo-audit and cargo-deny on main fail on RUSTSEC-2026-0190 — unsoundness in anyhow's Error::downcast_mut() (UB when downcasting a context-wrapped error). Fixed in 1.0.103. Lockfile-only bump; no API changes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0ad5ae27a..996d5f74e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -209,9 +209,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.102" +version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +checksum = "2a4385e2e34eb35d6b3efe798b9eb88096925d87726c0798709bf56d9ed84af3" [[package]] name = "ar_archive_writer" From 499d44daa761e111c9391c7f11e71213ae6d797e Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:28:30 +0200 Subject: [PATCH 03/10] feat(controller): auto-derive sandbox egress from referenced McpServer URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding an MCP server should 'just work': the per-pod router is the only network path to it, so the sandbox's default-deny NetworkPolicy must admit the router→MCP hop. Until now the operator also had to hand-write a networkPolicy.allowedEndpoints entry for every MCP; miss it and calls to an in-cluster MCP silently time out. The controller now parses each referenced McpServer's spec.url and emits the correct egress rule automatically: - in-cluster Service DNS (*.svc.cluster.local) -> namespaceSelector rule. Under Cilium a K8s NetworkPolicy ipBlock (even 0.0.0.0/0) only matches the reserved 'world' entity and never an in-cluster pod, so an ipBlock rule would never admit traffic to another pod — namespaceSelector is the only form that works. - external host, non-443 -> coarse port-level ipBlock (router enforces host). - external host, 443 -> already covered by the blanket HTTPS rule. New helpers mcp_url_host_port / cluster_internal_namespace / mcp_egress_rule with unit tests; the reconcile loop walks effective_mcp_server_refs and adds de-duplicated rules. Missing/unparseable referents are logged and skipped. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- controller/src/reconciler/mod.rs | 289 ++++++++++++++++++++++++++++++- 1 file changed, 285 insertions(+), 4 deletions(-) diff --git a/controller/src/reconciler/mod.rs b/controller/src/reconciler/mod.rs index a202ce24b..018271a90 100644 --- a/controller/src/reconciler/mod.rs +++ b/controller/src/reconciler/mod.rs @@ -89,6 +89,109 @@ pub(crate) fn isolation_scheduling(isolation: &str) -> (Option<&'static str>, &' } } +/// Parse an `McpServer.spec.url` into the `(host, port)` the sandbox's +/// inference-router must be allowed to reach. +/// +/// Returns `None` when the URL is empty or unparseable (the caller skips +/// egress derivation and lets the router surface the misconfiguration). +/// The port defaults from the scheme (`https` → 443, `http` → 80) when not +/// explicitly given. Only `http`/`https` MCP transports are supported. +/// +/// Examples: +/// - `https://mcp.deepwiki.com/mcp` → `("mcp.deepwiki.com", 443)` +/// - `http://playwright-mcp.default.svc.cluster.local:8931/mcp` +/// → `("playwright-mcp.default.svc.cluster.local", 8931)` +pub(crate) fn mcp_url_host_port(url: &str) -> Option<(String, u16)> { + let url = url.trim(); + let (scheme, rest) = url.split_once("://")?; + let default_port: u16 = match scheme.to_ascii_lowercase().as_str() { + "https" => 443, + "http" => 80, + _ => return None, + }; + // Strip path / query / fragment — everything from the first '/', '?' or '#'. + let authority = rest.split(['/', '?', '#']).next().unwrap_or("").trim(); + if authority.is_empty() { + return None; + } + // Drop any userinfo ("user:pass@host"). + let host_port = authority.rsplit('@').next().unwrap_or(authority); + // IPv6 literal `[::1]:8931` — not expected for MCP services; reject so the + // caller falls back rather than mis-splitting on the inner colons. + if host_port.starts_with('[') { + return None; + } + let (host, port) = match host_port.rsplit_once(':') { + Some((h, p)) => { + let port = p.parse::().ok()?; + (h, port) + } + None => (host_port, default_port), + }; + if host.is_empty() { + return None; + } + Some((host.to_string(), port)) +} + +/// Build the NetworkPolicy egress rule that admits the sandbox router to an +/// MCP server at `host:port`. In-cluster Service DNS names get a +/// `namespaceSelector` rule (the only form Cilium honours for pod +/// destinations); external hosts get the coarse port-level `ipBlock` +/// (host-level enforcement lives in the router). Returns `None` for an +/// external host on 443 — already covered by the blanket HTTPS rule. +fn mcp_egress_rule(host: &str, port: u16) -> Option { + if let Some(ns) = cluster_internal_namespace(host) { + Some(json!({ + "to": [{ + "namespaceSelector": { + "matchLabels": { "kubernetes.io/metadata.name": ns } + } + }], + "ports": [{"protocol": "TCP", "port": port}] + })) + } else if port == 443 { + None + } else { + Some(json!({ + "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}], + "ports": [{"protocol": "TCP", "port": port}] + })) + } +} + +/// If `host` is a cluster-internal Kubernetes DNS name, return the namespace +/// it resolves into; otherwise `None`. +/// +/// In-cluster MCP servers / egress endpoints are addressed by their Service +/// DNS name — `..svc.cluster.local` (or the `.svc` short form). For +/// those we MUST express the NetworkPolicy egress with a `namespaceSelector`, +/// not an `ipBlock`: under the Cilium CNI a K8s NetworkPolicy `ipBlock` CIDR +/// (including `0.0.0.0/0`) only matches the reserved `world` entity and is +/// NOT applied to in-cluster pod endpoints, so an `ipBlock`-based rule silently +/// fails to admit traffic to another pod. Selecting the destination namespace +/// by its well-known `kubernetes.io/metadata.name` label is the portable way to +/// open egress to an in-cluster Service. +/// +/// Returns `None` for external hostnames (e.g. `api.openai.com`) and for bare, +/// namespace-ambiguous names — those keep the coarse `ipBlock` port rule, with +/// fine-grained host enforcement handled by the router's CONNECT allowlist. +pub(crate) fn cluster_internal_namespace(host: &str) -> Option { + let host = host.trim().trim_end_matches('.'); + // `..svc.cluster.local` or the cluster-suffixless `..svc`. + let stripped = host + .strip_suffix(".svc.cluster.local") + .or_else(|| host.strip_suffix(".svc"))?; + // After stripping the `.svc[.cluster.local]` suffix, the remainder is + // `.`; the namespace is the last label. + let ns = stripped.rsplit('.').next()?; + if ns.is_empty() || ns == stripped { + // `stripped` had no `.` → no namespace label present. + return None; + } + Some(ns.to_string()) +} + /// Build the egress-guard init-container command. /// /// Standard sandboxes (every kind except SRE) get the full lockdown: @@ -251,9 +354,100 @@ mod egress_guard_tests { ); } } -} -/// Custom error type that bridges serde_json and kube errors. + #[test] + fn cluster_internal_namespace_parses_fqdn_forms() { + use super::cluster_internal_namespace; + assert_eq!( + cluster_internal_namespace("playwright-mcp.default.svc.cluster.local"), + Some("default".to_string()) + ); + assert_eq!( + cluster_internal_namespace("playwright-mcp.default.svc"), + Some("default".to_string()) + ); + // Trailing dot (fully-qualified) is tolerated. + assert_eq!( + cluster_internal_namespace("svc-x.team-a.svc.cluster.local."), + Some("team-a".to_string()) + ); + } + + #[test] + fn cluster_internal_namespace_rejects_external_and_ambiguous() { + use super::cluster_internal_namespace; + // External hostnames keep the coarse ipBlock rule. + assert_eq!(cluster_internal_namespace("api.openai.com"), None); + assert_eq!(cluster_internal_namespace("example.com"), None); + // Bare names without a namespace label are ambiguous → None. + assert_eq!(cluster_internal_namespace("playwright-mcp"), None); + // A `.svc` suffix with no namespace label is not a valid in-cluster name. + assert_eq!(cluster_internal_namespace("svc"), None); + } + + #[test] + fn mcp_url_host_port_parses_common_forms() { + use super::mcp_url_host_port; + // Remote HTTPS MCP — default port 443. + assert_eq!( + mcp_url_host_port("https://mcp.deepwiki.com/mcp"), + Some(("mcp.deepwiki.com".to_string(), 443)) + ); + assert_eq!( + mcp_url_host_port("https://api.githubcopilot.com/mcp"), + Some(("api.githubcopilot.com".to_string(), 443)) + ); + // In-cluster HTTP MCP with explicit port. + assert_eq!( + mcp_url_host_port("http://playwright-mcp.default.svc.cluster.local:8931/mcp"), + Some(("playwright-mcp.default.svc.cluster.local".to_string(), 8931)) + ); + // http default port 80, no path. + assert_eq!( + mcp_url_host_port("http://svc.ns.svc.cluster.local"), + Some(("svc.ns.svc.cluster.local".to_string(), 80)) + ); + // Query/fragment are stripped. + assert_eq!( + mcp_url_host_port("https://host.example:8443/mcp?x=1#frag"), + Some(("host.example".to_string(), 8443)) + ); + } + + #[test] + fn mcp_url_host_port_rejects_invalid() { + use super::mcp_url_host_port; + assert_eq!(mcp_url_host_port(""), None); + assert_eq!(mcp_url_host_port("not-a-url"), None); + assert_eq!(mcp_url_host_port("ftp://host:21/x"), None); + assert_eq!(mcp_url_host_port("https://"), None); + // Non-numeric port. + assert_eq!(mcp_url_host_port("https://host:notaport/mcp"), None); + // IPv6 literals are not supported (fall back rather than mis-split). + assert_eq!(mcp_url_host_port("http://[::1]:8931/mcp"), None); + } + + #[test] + fn mcp_egress_rule_shapes_by_destination() { + use super::mcp_egress_rule; + // In-cluster → namespaceSelector on the destination namespace + port. + let rule = mcp_egress_rule("playwright-mcp.default.svc.cluster.local", 8931) + .expect("in-cluster rule"); + assert_eq!( + rule["to"][0]["namespaceSelector"]["matchLabels"]["kubernetes.io/metadata.name"], + "default" + ); + assert_eq!(rule["ports"][0]["port"], 8931); + + // External non-443 → coarse ipBlock + port. + let rule = mcp_egress_rule("mcp.example.com", 8080).expect("external non-443 rule"); + assert_eq!(rule["to"][0]["ipBlock"]["cidr"], "0.0.0.0/0"); + assert_eq!(rule["ports"][0]["port"], 8080); + + // External 443 → None (blanket HTTPS rule already covers it). + assert!(mcp_egress_rule("mcp.deepwiki.com", 443).is_none()); + } +} #[derive(Debug, thiserror::Error)] enum ReconcileError { #[error("Kubernetes API error: {0}")] @@ -1184,8 +1378,26 @@ async fn reconcile(sandbox: Arc, ctx: Arc) -> Result..svc.cluster.local`) + // must be admitted with a `namespaceSelector`: a `0.0.0.0/0` ipBlock + // does NOT match in-cluster pods under Cilium, so the agent's router + // could never reach an in-cluster MCP server. External hosts keep the + // coarse port-level ipBlock (host-level enforcement lives in the + // router's CONNECT allowlist). + if let Some(ns) = cluster_internal_namespace(&ep.host) { + egress_rules.push(json!({ + "to": [{ + "namespaceSelector": { + "matchLabels": { "kubernetes.io/metadata.name": ns } + } + }], + "ports": [{"protocol": "TCP", "port": port}] + })); + } else { egress_rules.push(json!({ "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}], "ports": [{"protocol": "TCP", "port": port}] @@ -1194,6 +1406,75 @@ async fn reconcile(sandbox: Arc, ctx: Arc) -> Result = + Api::namespaced(client.clone(), &sandbox_self_ns); + for mcp_ref in governance_config.effective_mcp_server_refs() { + let ref_name = mcp_ref.name.trim(); + if ref_name.is_empty() { + continue; + } + let url = match mcp_api.get(ref_name).await { + Ok(mcp) => mcp.spec.url.unwrap_or_default(), + Err(kube::Error::Api(ae)) if ae.code == 404 => { + // Missing referent is surfaced by the JWKS-mirror path + // below; here we just skip egress derivation for it. + tracing::warn!( + sandbox = %name, + mcp = %ref_name, + "referenced McpServer not found; egress not auto-derived", + ); + continue; + } + Err(e) => { + tracing::error!(error = %e, sandbox = %name, mcp = %ref_name, "McpServer lookup failed"); + return Ok(Action::requeue(Duration::from_secs(15))); + } + }; + // Empty when the signed-bundle authoring path supplies the url; + // those endpoints still need an explicit allowlist entry. + if url.is_empty() { + continue; + } + let Some((host, port)) = mcp_url_host_port(&url) else { + tracing::warn!( + sandbox = %name, + mcp = %ref_name, + url = %url, + "McpServer url not parseable for egress derivation; skipping", + ); + continue; + }; + if let Some(rule) = mcp_egress_rule(&host, port) + && !egress_rules.contains(&rule) + { + tracing::info!( + sandbox = %name, + mcp = %ref_name, + host = %host, + port, + "auto-derived MCP egress rule", + ); + egress_rules.push(rule); + } + } + } + // Compute ingress rules up front. When governance is enabled the sandbox // exposes :8443 (mesh inference) and :18789/:18791 (gateway WebUX/WebSocket) // to peer sandbox namespaces, plus :8443 to the operator namespace for From 18fc9c5c9ac6d34e99b58b88f97ec66ee6f05302 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:28:39 +0200 Subject: [PATCH 04/10] feat(cli): kars update + automatic CLI update notice Every kars run now does a quiet, cached check (at most once per day) of the npm registry and prints a one-line notice when a newer @kars-runtime/cli has been published, with a short changelog summary pulled from the GitHub release. The new 'kars update' command is the explicit, always-fresh path: it shows the changelog and offers to install (npm install -g @kars-runtime/cli@latest), with --check (report-only, non-zero exit) and --yes (non-interactive) modes. Best-effort and unobtrusive by design: hard 1.5s network timeout, all errors swallowed, result cached in ~/.kars/update-check.json, never prompts after an arbitrary command, and silent in CI / non-TTY / when KARS_NO_UPDATE_CHECK=1. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/cli.ts | 5 + cli/src/commands/update.ts | 59 ++++++ cli/src/index.ts | 12 +- cli/src/lib/update-check.test.ts | 156 +++++++++++++++ cli/src/lib/update-check.ts | 319 +++++++++++++++++++++++++++++++ 5 files changed, 550 insertions(+), 1 deletion(-) create mode 100644 cli/src/commands/update.ts create mode 100644 cli/src/lib/update-check.test.ts create mode 100644 cli/src/lib/update-check.ts diff --git a/cli/src/cli.ts b/cli/src/cli.ts index 4b2539939..a018a62d9 100644 --- a/cli/src/cli.ts +++ b/cli/src/cli.ts @@ -39,6 +39,7 @@ import { inspectCommand } from "./commands/inspect.js"; import { auditCommand } from "./commands/audit.js"; import { headlampCommand } from "./commands/headlamp.js"; import { sreCommand } from "./commands/sre.js"; +import { updateCommand } from "./commands/update.js"; export function createCli(): Command { const program = new Command(); @@ -100,6 +101,9 @@ export function createCli(): Command { // Attestation program.addCommand(attestCommand()); + // Self-management + program.addCommand(updateCommand()); + program.addHelpText("after", ` Command groups: Lifecycle up, dev, add, push, destroy @@ -110,6 +114,7 @@ Command groups: Interop convert, a2a, a2a-agent, migrate Governance toolpolicy, inferencepolicy, mcp, memory Attestation attest + Self update Quick start: kars up # Provision Azure + deploy controller + first sandbox diff --git a/cli/src/commands/update.ts b/cli/src/commands/update.ts new file mode 100644 index 000000000..053808d7c --- /dev/null +++ b/cli/src/commands/update.ts @@ -0,0 +1,59 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// commands/update.ts — `kars update`: explicitly check npm for a newer +// `@kars-runtime/cli`, show the changelog, and (interactively) install it. +// +// This is the on-demand counterpart to the passive end-of-run notice wired into +// every invocation (lib/update-check.ts). It always hits the network (bypassing +// the 24h cache) and always offers to install when a newer version exists. + +import { Command } from "commander"; +import chalk from "chalk"; +import { cliVersion } from "../lib/version.js"; +import { + CLI_PACKAGE, + checkForCliUpdate, + renderUpdateNotice, +} from "../lib/update-check.js"; + +export function updateCommand(): Command { + const cmd = new Command("update"); + + cmd + .description(`Check for and install a newer ${CLI_PACKAGE}`) + .option("--check", "Only check; never prompt or install") + .option("--yes", "Install the latest version without prompting") + .action(async (options: { check?: boolean; yes?: boolean }) => { + const info = await checkForCliUpdate({ force: true, withChangelog: true }); + + if (!info) { + console.error( + chalk.green(`\n ✓ ${CLI_PACKAGE} is up to date `) + + chalk.dim(`(v${cliVersion()}).\n`), + ); + return; + } + + if (options.check) { + // Report-only: print the notice without the install prompt. + await renderUpdateNotice(info, { offerInstall: false }); + // Non-zero so scripts/CI can detect "an update is available". + process.exitCode = 1; + return; + } + + if (options.yes) { + const { runGlobalInstall } = await import("../lib/update-check.js"); + await renderUpdateNotice(info, { offerInstall: false }); + await runGlobalInstall(); + return; + } + + await renderUpdateNotice(info, { + offerInstall: !!process.stdout.isTTY && !!process.stdin.isTTY, + }); + }); + + return cmd; +} diff --git a/cli/src/index.ts b/cli/src/index.ts index 0e0b4ac43..051f3b1fe 100644 --- a/cli/src/index.ts +++ b/cli/src/index.ts @@ -11,7 +11,17 @@ import { createCli } from "./cli.js"; import { bootstrapKubeContext } from "./lib/kube-bootstrap.js"; +import { maybeNotifyCliUpdate } from "./lib/update-check.js"; await bootstrapKubeContext(process.argv); const program = createCli(); -program.parse(process.argv); +await program.parseAsync(process.argv); + +// Passive, best-effort: tell the user if a newer CLI has been published. Runs +// after the command so it never delays or clutters real output, is cached to +// at most one network check per day, and stays silent in CI / non-TTY / when +// opted out (KARS_NO_UPDATE_CHECK=1). The interactive install lives in +// `kars update`. Skip when the user already ran `kars update`. +if (!process.argv.slice(2).includes("update")) { + await maybeNotifyCliUpdate(); +} diff --git a/cli/src/lib/update-check.test.ts b/cli/src/lib/update-check.test.ts new file mode 100644 index 000000000..cea271ea2 --- /dev/null +++ b/cli/src/lib/update-check.test.ts @@ -0,0 +1,156 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import os from "node:os"; +import path from "node:path"; +import { promises as fs } from "node:fs"; +import { + updateCheckDisabled, + summarizeReleaseBody, + fetchLatestVersion, + fetchChangelogSummary, + checkForCliUpdate, + CLI_PACKAGE, +} from "./update-check.js"; + +// Redirect the on-disk cache into a throwaway temp dir so tests never touch the +// developer's real ~/.kars/update-check.json. +let stateDir: string; +beforeEach(async () => { + stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "kars-update-test-")); + process.env.KARS_STATE_DIR = stateDir; +}); +afterEach(async () => { + delete process.env.KARS_STATE_DIR; + await fs.rm(stateDir, { recursive: true, force: true }); +}); + +describe("updateCheckDisabled", () => { + it("is enabled by default", () => { + expect(updateCheckDisabled({})).toBe(false); + }); + + it("respects explicit opt-out env vars", () => { + expect(updateCheckDisabled({ KARS_NO_UPDATE_CHECK: "1" })).toBe(true); + expect(updateCheckDisabled({ KARS_NO_UPDATE_CHECK: "true" })).toBe(true); + expect(updateCheckDisabled({ KARS_UPDATE_CHECK: "0" })).toBe(true); + expect(updateCheckDisabled({ KARS_UPDATE_CHECK: "false" })).toBe(true); + }); + + it("disables in CI", () => { + expect(updateCheckDisabled({ CI: "true" })).toBe(true); + expect(updateCheckDisabled({ CI: "1" })).toBe(true); + // A falsey CI value does not disable. + expect(updateCheckDisabled({ CI: "0" })).toBe(false); + expect(updateCheckDisabled({ CI: "false" })).toBe(false); + }); +}); + +describe("summarizeReleaseBody", () => { + it("prefers the summary half of a 'vX — summary' release title", () => { + expect(summarizeReleaseBody("v0.1.24 — MCP keepalive + egress auto-derive", "")) + .toBe("MCP keepalive + egress auto-derive"); + }); + + it("falls back to the first meaningful body line, stripped of markdown", () => { + const body = "## What's changed\n\n- Fixed the Playwright `about:blank` session reaper\n"; + expect(summarizeReleaseBody(undefined, body)) + .toBe("Fixed the Playwright about:blank session reaper"); + }); + + it("truncates very long summaries", () => { + const long = "x".repeat(200); + const out = summarizeReleaseBody(long, "")!; + expect(out.length).toBeLessThanOrEqual(100); + expect(out.endsWith("…")).toBe(true); + }); + + it("returns undefined when nothing usable is present", () => { + expect(summarizeReleaseBody(undefined, undefined)).toBeUndefined(); + expect(summarizeReleaseBody("", " \n \n")).toBeUndefined(); + }); +}); + +describe("fetchLatestVersion", () => { + const realFetch = globalThis.fetch; + afterEach(() => { + globalThis.fetch = realFetch; + vi.restoreAllMocks(); + }); + + it("returns the version from the registry latest dist-tag", async () => { + globalThis.fetch = vi.fn(async () => + new Response(JSON.stringify({ version: "0.2.0" }), { status: 200 }), + ) as unknown as typeof fetch; + expect(await fetchLatestVersion(CLI_PACKAGE, 500)).toBe("0.2.0"); + }); + + it("returns null on a non-200 response", async () => { + globalThis.fetch = vi.fn(async () => new Response("nope", { status: 404 })) as unknown as typeof fetch; + expect(await fetchLatestVersion(CLI_PACKAGE, 500)).toBeNull(); + }); + + it("returns null (never throws) on a network error", async () => { + globalThis.fetch = vi.fn(async () => { + throw new Error("ECONNREFUSED"); + }) as unknown as typeof fetch; + expect(await fetchLatestVersion(CLI_PACKAGE, 500)).toBeNull(); + }); +}); + +describe("fetchChangelogSummary", () => { + const realFetch = globalThis.fetch; + afterEach(() => { + globalThis.fetch = realFetch; + vi.restoreAllMocks(); + }); + + it("summarizes the GitHub release for the tag", async () => { + globalThis.fetch = vi.fn(async () => + new Response(JSON.stringify({ name: "v0.2.0 — big fixes", body: "" }), { status: 200 }), + ) as unknown as typeof fetch; + expect(await fetchChangelogSummary("0.2.0", 500)).toBe("big fixes"); + }); + + it("returns undefined when the release is missing", async () => { + globalThis.fetch = vi.fn(async () => new Response("", { status: 404 })) as unknown as typeof fetch; + expect(await fetchChangelogSummary("9.9.9", 500)).toBeUndefined(); + }); +}); + +describe("checkForCliUpdate", () => { + const realFetch = globalThis.fetch; + afterEach(() => { + globalThis.fetch = realFetch; + vi.restoreAllMocks(); + }); + + it("returns null when disabled", async () => { + const info = await checkForCliUpdate({ env: { KARS_NO_UPDATE_CHECK: "1" } }); + expect(info).toBeNull(); + }); + + it("returns null when the registry reports an older/equal version", async () => { + globalThis.fetch = vi.fn(async () => + new Response(JSON.stringify({ version: "0.0.0" }), { status: 200 }), + ) as unknown as typeof fetch; + const info = await checkForCliUpdate({ force: true }); + expect(info).toBeNull(); + }); + + it("reports an update when the registry has a newer version", async () => { + globalThis.fetch = vi.fn(async (url: string | URL) => { + const u = String(url); + if (u.includes("registry.npmjs.org")) { + return new Response(JSON.stringify({ version: "999.0.0" }), { status: 200 }); + } + // changelog + return new Response(JSON.stringify({ name: "v999.0.0 — the future", body: "" }), { status: 200 }); + }) as unknown as typeof fetch; + const info = await checkForCliUpdate({ force: true, withChangelog: true }); + expect(info).not.toBeNull(); + expect(info!.latest).toBe("999.0.0"); + expect(info!.changelog).toBe("the future"); + }); +}); diff --git a/cli/src/lib/update-check.ts b/cli/src/lib/update-check.ts new file mode 100644 index 000000000..740d9f77a --- /dev/null +++ b/cli/src/lib/update-check.ts @@ -0,0 +1,319 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// lib/update-check.ts — best-effort "a newer CLI is published" notifier. +// +// Every interactive `kars` run quietly checks whether a newer +// `@kars-runtime/cli` has been published to npm and, if so, prints a short +// notice (current → latest + a one-line changelog) and offers to install it. +// Design constraints, in priority order: +// 1. NEVER break or slow a command. The check is wrapped in a hard timeout, +// all errors are swallowed, and the result is cached so we hit the network +// at most once per `CHECK_TTL_MS` window. +// 2. NEVER nag. We surface the notice at most once per `CHECK_TTL_MS`, only on +// a TTY, never in CI, and respect an explicit opt-out env var. +// 3. NEVER act without consent. The install only runs if the user confirms the +// interactive prompt; otherwise we just print the copy-paste command. +// +// The cache lives at `~/.kars/update-check.json`. It records the last fetched +// latest version, when we fetched it, and when we last showed the notice — so +// the notice is decoupled from the network check (classic update-notifier +// behaviour: we may show a previously-fetched result while refreshing for next +// time). + +import { promises as fs } from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { cliVersion } from "./version.js"; +import { compareVersions } from "./release.js"; + +/** npm package this CLI is published as. */ +export const CLI_PACKAGE = "@kars-runtime/cli"; + +/** How long a network result / shown-notice is considered fresh. */ +export const CHECK_TTL_MS = 24 * 60 * 60 * 1000; // 24h + +/** Hard cap on the registry request so a slow/unreachable network never delays + * the user's actual command by more than this. */ +const FETCH_TIMEOUT_MS = 1500; + +const CACHE_DIR = path.join(os.homedir(), ".kars"); +const CACHE_FILE = path.join(CACHE_DIR, "update-check.json"); + +/** Resolve the cache file, honouring `KARS_STATE_DIR` so tests (and unusual + * environments) can redirect the on-disk cache away from `~/.kars`. */ +function cacheFile(): string { + const override = process.env.KARS_STATE_DIR; + return override ? path.join(override, "update-check.json") : CACHE_FILE; +} + +interface UpdateCache { + /** Latest version string seen on the registry (no `v` prefix), or null. */ + latest: string | null; + /** Epoch ms of the last successful (or attempted) registry fetch. */ + checkedAt: number; + /** Epoch ms we last printed the notice to the user. */ + notifiedAt: number; +} + +export interface UpdateInfo { + current: string; + latest: string; + /** Short changelog blurb for `latest`, when one is cheaply available. */ + changelog?: string; +} + +/** True when update checks are disabled for this process: explicit opt-out, CI, + * or a non-interactive stdout (piped / redirected). Exported for tests. */ +export function updateCheckDisabled(env: NodeJS.ProcessEnv = process.env): boolean { + if (env.KARS_NO_UPDATE_CHECK === "1" || env.KARS_NO_UPDATE_CHECK === "true") return true; + if (env.KARS_UPDATE_CHECK === "0" || env.KARS_UPDATE_CHECK === "false") return true; + // Common CI signal — never nag an automated pipeline. + if (env.CI && env.CI !== "0" && env.CI !== "false") return true; + return false; +} + +async function readCache(): Promise { + try { + const raw = await fs.readFile(cacheFile(), "utf8"); + const parsed = JSON.parse(raw) as Partial; + if (typeof parsed.checkedAt !== "number") return null; + return { + latest: typeof parsed.latest === "string" ? parsed.latest : null, + checkedAt: parsed.checkedAt, + notifiedAt: typeof parsed.notifiedAt === "number" ? parsed.notifiedAt : 0, + }; + } catch { + return null; + } +} + +async function writeCache(cache: UpdateCache): Promise { + try { + const file = cacheFile(); + await fs.mkdir(path.dirname(file), { recursive: true }); + await fs.writeFile(file, JSON.stringify(cache), "utf8"); + } catch { + // Cache is an optimisation; a write failure must never surface. + } +} + +/** Fetch the `latest` dist-tag version from the npm registry. Best-effort: + * returns null on any error or timeout. Exported for tests. */ +export async function fetchLatestVersion( + pkg: string = CLI_PACKAGE, + timeoutMs: number = FETCH_TIMEOUT_MS, +): Promise { + // The `latest` endpoint returns just the dist-tag manifest — far smaller than + // the full packument. + const url = `https://registry.npmjs.org/${pkg.replace("/", "%2f")}/latest`; + try { + const resp = await fetch(url, { + signal: AbortSignal.timeout(timeoutMs), + headers: { accept: "application/json" }, + }); + if (!resp.ok) return null; + const body = (await resp.json()) as { version?: string }; + return typeof body.version === "string" ? body.version : null; + } catch { + return null; + } +} + +/** A terse, best-effort one-line changelog for `version`, pulled from the + * GitHub release body. Returns undefined when unavailable. Exported for tests. */ +export async function fetchChangelogSummary( + version: string, + timeoutMs: number = FETCH_TIMEOUT_MS, +): Promise { + const tag = version.startsWith("v") ? version : `v${version}`; + const url = `https://api.github.com/repos/Azure/kars/releases/tags/${tag}`; + try { + const resp = await fetch(url, { + signal: AbortSignal.timeout(timeoutMs), + headers: { accept: "application/vnd.github+json" }, + }); + if (!resp.ok) return undefined; + const body = (await resp.json()) as { name?: string; body?: string }; + return summarizeReleaseBody(body.name, body.body); + } catch { + return undefined; + } +} + +/** Reduce a GitHub release `name`/`body` to a single readable line. Prefers the + * release title; otherwise the first meaningful bullet/line of the body. + * Exported for tests. */ +export function summarizeReleaseBody(name?: string, body?: string): string | undefined { + const clean = (s: string): string => + s.replace(/^[#>*\-\s]+/, "").replace(/`/g, "").replace(/\s+/g, " ").trim(); + if (name && name.trim()) { + // Release titles are often "vX.Y.Z — summary"; keep the summary half. + const dash = name.split(/\s[—-]\s/); + const tail = dash.length > 1 ? dash.slice(1).join(" - ") : name; + const t = clean(tail); + if (t) return truncate(t); + } + if (body) { + for (const line of body.split(/\r?\n/)) { + // Skip markdown headings ("## What's changed") — we want a content line. + if (/^\s*#/.test(line)) continue; + const c = clean(line); + if (c.length >= 8) return truncate(c); + } + } + return undefined; +} + +function truncate(s: string, max = 100): string { + return s.length > max ? `${s.slice(0, max - 1).trimEnd()}…` : s; +} + +/** + * Determine whether an update is available, using the cache to avoid hitting the + * network more than once per TTL. Returns null when up-to-date, disabled, or the + * lookup couldn't determine a newer version. + * + * `force` bypasses the TTL (used by the explicit `kars update` command). + */ +export async function checkForCliUpdate(opts: { + force?: boolean; + withChangelog?: boolean; + env?: NodeJS.ProcessEnv; +} = {}): Promise { + const env = opts.env ?? process.env; + if (!opts.force && updateCheckDisabled(env)) return null; + + const current = cliVersion(); + const now = Date.now(); + const cache = await readCache(); + + let latest = cache?.latest ?? null; + const fresh = cache && now - cache.checkedAt < CHECK_TTL_MS; + + if (opts.force || !fresh) { + const fetched = await fetchLatestVersion(); + // Record the attempt regardless of outcome so a flaky network doesn't make + // us re-check on every single invocation. + latest = fetched ?? latest; + await writeCache({ + latest, + checkedAt: now, + notifiedAt: cache?.notifiedAt ?? 0, + }); + } + + if (!latest) return null; + if (compareVersions(latest, current) <= 0) return null; + + const info: UpdateInfo = { current, latest }; + if (opts.withChangelog) { + info.changelog = await fetchChangelogSummary(latest); + } + return info; +} + +/** Record that we've shown the notice now, so we don't repeat it within the TTL. */ +async function markNotified(): Promise { + const cache = (await readCache()) ?? { latest: null, checkedAt: Date.now(), notifiedAt: 0 }; + cache.notifiedAt = Date.now(); + await writeCache(cache); +} + +/** True if we've already shown a notice within the TTL window. */ +async function notifiedRecently(): Promise { + const cache = await readCache(); + return !!cache && Date.now() - cache.notifiedAt < CHECK_TTL_MS; +} + +/** + * The passive, end-of-run hook wired into every `kars` invocation. Best-effort: + * checks for an update (cached) and, at most once per TTL on an interactive + * terminal, offers to install it. Silent and side-effect-free when up-to-date, + * disabled, non-interactive, or already shown recently. + * + * Returns true if a notice was shown (used by tests). + */ +export async function maybeNotifyCliUpdate(): Promise { + try { + if (updateCheckDisabled()) return false; + if (await notifiedRecently()) return false; + + const info = await checkForCliUpdate({ withChangelog: true }); + if (!info) return false; + + await markNotified(); + // The passive notice never hijacks stdin after an arbitrary command — it + // prints the changelog + how to upgrade. The interactive install lives in + // `kars update`, which the notice points to. + await renderUpdateNotice(info, { offerInstall: false }); + return true; + } catch { + return false; + } +} + +/** + * Render the update notice. On an interactive TTY (`offerInstall`) it prompts to + * run the global install; otherwise it just prints the copy-paste command. + * Exported so the `kars update` command can reuse the exact same presentation. + */ +export async function renderUpdateNotice( + info: UpdateInfo, + opts: { offerInstall?: boolean } = {}, +): Promise { + const { default: chalk } = await import("chalk"); + const installCmd = `npm install -g ${CLI_PACKAGE}@latest`; + + const lines = [ + "", + chalk.cyan(` ✨ A new kars CLI is available: `) + + chalk.dim(info.current) + chalk.cyan(" → ") + chalk.green.bold(info.latest), + ]; + if (info.changelog) { + lines.push(chalk.dim(` ${info.changelog}`)); + } + lines.push( + chalk.dim(` Release notes: https://github.com/Azure/kars/releases/tag/v${info.latest}`), + "", + ); + console.error(lines.join("\n")); + + if (opts.offerInstall) { + const { default: inquirer } = await import("inquirer"); + const { doInstall } = await inquirer.prompt<{ doInstall: boolean }>([ + { + type: "confirm", + name: "doInstall", + message: `Update to ${info.latest} now?`, + default: true, + }, + ]); + if (doInstall) { + await runGlobalInstall(); + return; + } + } + console.error( + chalk.dim(` Run `) + chalk.bold(`kars update`) + + chalk.dim(` to install it, or: `) + chalk.bold(installCmd) + "\n", + ); +} + +/** Run the global npm install of the latest CLI, streaming output. Best-effort: + * reports failure with the manual command rather than throwing. */ +export async function runGlobalInstall(): Promise { + const { default: chalk } = await import("chalk"); + const { execa } = await import("execa"); + const installCmd = `npm install -g ${CLI_PACKAGE}@latest`; + try { + console.error(chalk.dim(`\n $ ${installCmd}\n`)); + await execa("npm", ["install", "-g", `${CLI_PACKAGE}@latest`], { stdio: "inherit" }); + console.error(chalk.green(`\n ✓ Updated. Re-run your command to use ${CLI_PACKAGE}@latest.\n`)); + } catch { + console.error( + chalk.yellow(`\n ⚠ Automatic update failed. Update manually with:\n `) + + chalk.bold(installCmd) + "\n", + ); + } +} From c7d1e2c94540a91f9385caeba54c85e0cad0ff91 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:28:46 +0200 Subject: [PATCH 05/10] feat(upgrade): pre-flight server-side-apply conflict detection Helm v4 applies releases server-side, so a Deployment field owned by another field manager (e.g. a manual 'kubectl set env HERMES_RUNTIME_IMAGE') makes both the real apply AND its atomic rollback fail with a conflict, wedging the release mid-upgrade. kars upgrade now runs a server-side dry-run first and parses any field-manager conflicts. On conflict it stops BEFORE any change and prints the offending object/field/manager plus two remediation paths: --force-conflicts to take ownership, or a precise copy-paste command to hand the field back. It never silently clobbers a field a live operator legitimately manages. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/commands/upgrade.test.ts | 97 +++++++++++++ cli/src/commands/upgrade.ts | 226 ++++++++++++++++++++++++++++++- 2 files changed, 318 insertions(+), 5 deletions(-) diff --git a/cli/src/commands/upgrade.test.ts b/cli/src/commands/upgrade.test.ts index ee429a2ab..683794471 100644 --- a/cli/src/commands/upgrade.test.ts +++ b/cli/src/commands/upgrade.test.ts @@ -9,6 +9,8 @@ import { detectCurrentVersion, rolloutRestartAll, verifyHealth, + parseHelmFieldConflicts, + suggestConflictRemediation, } from "./upgrade.js"; describe("isFoundryProjectHost", () => { @@ -127,6 +129,29 @@ describe("buildHelmUpgradeArgs", () => { expect(args.slice(0, 4)).toEqual(["upgrade", "--install", "kars", "/chart"]); }); + it("does NOT take ownership of foreign-managed fields by default (no --force-conflicts)", () => { + const args = buildHelmUpgradeArgs(ctx, "/chart", "v0.1.20"); + expect(args).not.toContain("--force-conflicts"); + }); + + it("adds --force-conflicts only when explicitly opted in", () => { + const args = buildHelmUpgradeArgs(ctx, "/chart", "v0.1.20", { forceConflicts: true }); + expect(args).toContain("--force-conflicts"); + // still a real, atomic upgrade + expect(args).toContain("--atomic"); + }); + + it("builds a non-mutating server dry-run for the conflict pre-flight", () => { + const args = buildHelmUpgradeArgs(ctx, "/chart", "v0.1.20", { dryRunServer: true }); + expect(args).toContain("--dry-run=server"); + // dry-run must NOT carry --atomic/--wait (meaningless) nor force ownership + expect(args).not.toContain("--atomic"); + expect(args).not.toContain("--wait"); + expect(args).not.toContain("--force-conflicts"); + // ...but still carries the real image values so the dry-run is representative + expect(args.join(" ")).toContain("controller.image.tag=v0.1.20"); + }); + it("only sets the Foundry endpoint when one is configured", () => { const withFoundry = buildHelmUpgradeArgs({ ...ctx, foundryEndpoint: "https://x.services.ai.azure.com" }, "/chart", "v0.1.20").join(" "); expect(withFoundry).toContain("inferenceRouter.azure.openai.endpoint=https://x.services.ai.azure.com"); @@ -135,6 +160,78 @@ describe("buildHelmUpgradeArgs", () => { }); }); +describe("parseHelmFieldConflicts — failsafe field-manager conflict detection", () => { + // The real Helm v4 error that wedged the v0.1.18 → v0.1.23 upgrade on launch + // day, as printed to stderr (`Error: UPGRADE FAILED:` — quotes are bare here). + const realError = + "Error: UPGRADE FAILED: an error occurred while rolling back the release. " + + "original upgrade error: conflict occurred while applying object " + + "kars-system/kars-controller apps/v1, Kind=Deployment: Apply failed with 1 " + + 'conflict: conflict with "kubectl-set" using apps/v1: ' + + '.spec.template.spec.containers[name="controller"].env[name="HERMES_RUNTIME_IMAGE"].value'; + + it("extracts the object, kind, manager, and field from a real conflict", () => { + const conflicts = parseHelmFieldConflicts(realError); + expect(conflicts).toHaveLength(1); + expect(conflicts[0]).toMatchObject({ + object: "kars-system/kars-controller", + kind: "Deployment", + manager: "kubectl-set", + }); + expect(conflicts[0].field).toContain('env[name="HERMES_RUNTIME_IMAGE"].value'); + }); + + it("also parses Go-%q escaped quotes from the structured WARN log line", () => { + const warn = + 'level=WARN msg="upgrade failed" name=kars error="conflict occurred while ' + + 'applying object kars-system/kars-controller apps/v1, Kind=Deployment: Apply ' + + 'failed with 1 conflict: conflict with \\"kubectl-set\\" using apps/v1: ' + + '.spec.template.spec.containers[name=\\"controller\\"].env[name=\\"HERMES_RUNTIME_IMAGE\\"].value"'; + const conflicts = parseHelmFieldConflicts(warn); + expect(conflicts).toHaveLength(1); + expect(conflicts[0].manager).toBe("kubectl-set"); + }); + + it("attributes multiple conflicting fields to the right object and de-dupes", () => { + const multi = + "conflict occurred while applying object ns/dep apps/v1, Kind=Deployment: " + + 'Apply failed with 2 conflicts: conflict with "mgr-a" using apps/v1: .spec.foo, ' + + 'conflict with "mgr-b" using apps/v1: .spec.bar; ' + + // a repeat of the same conflict must not double-count + 'conflict with "mgr-a" using apps/v1: .spec.foo'; + const conflicts = parseHelmFieldConflicts(multi); + expect(conflicts).toHaveLength(2); + expect(conflicts.map((c) => c.manager).sort()).toEqual(["mgr-a", "mgr-b"]); + expect(conflicts.every((c) => c.object === "ns/dep")).toBe(true); + }); + + it("returns nothing for output that has no conflict (a clean dry-run)", () => { + expect(parseHelmFieldConflicts("")).toEqual([]); + expect(parseHelmFieldConflicts("Release \"kars\" has been upgraded. Happy Helming!")).toEqual([]); + }); +}); + +describe("suggestConflictRemediation — actionable copy-paste fix", () => { + it("turns an env-var conflict into the exact `kubectl set env … NAME-` command", () => { + const fix = suggestConflictRemediation({ + object: "kars-system/kars-controller", + kind: "Deployment", + manager: "kubectl-set", + field: '.spec.template.spec.containers[name="controller"].env[name="HERMES_RUNTIME_IMAGE"].value', + }); + expect(fix).toBe("kubectl set env deployment/kars-controller -n kars-system HERMES_RUNTIME_IMAGE-"); + }); + + it("returns null for field shapes it can't safely auto-remediate", () => { + expect(suggestConflictRemediation({ + object: "ns/cm", + kind: "ConfigMap", + manager: "kubectl-edit", + field: ".data.someKey", + })).toBeNull(); + }); +}); + // ── health / safety / recovery coverage ──────────────────────────── type Execa = typeof import("execa").execa; diff --git a/cli/src/commands/upgrade.ts b/cli/src/commands/upgrade.ts index c5ecc0808..4f9fab21f 100644 --- a/cli/src/commands/upgrade.ts +++ b/cli/src/commands/upgrade.ts @@ -9,9 +9,16 @@ // 1. detects current vs. target version (latest GHCR release, or --to ), // 2. records a rollback point (Helm revision), // 3. imports the target release images into the user's ACR (pinned + :latest), -// 4. `helm upgrade --atomic` (auto-rolls-back the release on failure), -// 5. rolls the controller, router, and sandbox workloads to the new images, -// 6. verifies health and prints what changed. +// 4. pre-flights a server-side dry-run to detect fields owned by another field +// manager (e.g. a manual `kubectl set env`) — Helm v4 applies server-side, +// so such a field would make the real apply AND its atomic rollback fail +// with a conflict and wedge the release. On conflict the upgrade stops +// BEFORE any change and prints the offending fields + remediation options +// (`--force-conflicts` to take ownership, or a copy-paste command to hand +// the field back); it never silently overwrites a field a live operator owns, +// 5. `helm upgrade --atomic` (auto-rolls-back the release on failure), +// 6. rolls the controller, router, and sandbox workloads to the new images, +// 7. verifies health and prints what changed. // `--dry-run` shows the plan with no writes; `--rollback` reverts to the // previous Helm revision. @@ -129,7 +136,7 @@ export function buildHelmUpgradeArgs( ctx: UpgradeContext, helmPath: string, target?: string, - opts: { skipRuntimeImages?: boolean } = {}, + opts: { skipRuntimeImages?: boolean; forceConflicts?: boolean; dryRunServer?: boolean } = {}, ): string[] { const tag = target && target.length > 0 ? target : "latest"; const args = [ @@ -155,7 +162,24 @@ export function buildHelmUpgradeArgs( args.push("--set", `${valueKey}=${ctx.acrLoginServer}/${repo}:${tag}`); } } - args.push("--atomic", "--wait", "--timeout", "8m"); + // Execution mode: + // - dryRunServer: a server-side apply in dry-run — mutates nothing but makes + // the API server report field-manager (server-side apply) conflicts, which + // is how the pre-flight detects fields another manager owns BEFORE we touch + // the cluster. `--atomic`/`--wait` are meaningless here and omitted. + // - otherwise: the real upgrade. `--atomic` auto-rolls-back a failed rollout. + // - forceConflicts: opt-in. Makes server-side apply overwrite fields owned by + // other field managers (e.g. a manual `kubectl set env`). Off by default so + // we never silently clobber a field a live operator legitimately manages — + // the pre-flight surfaces the conflict and the operator chooses. + if (opts.dryRunServer) { + args.push("--dry-run=server"); + } else { + args.push("--atomic", "--wait", "--timeout", "8m"); + } + if (opts.forceConflicts) { + args.push("--force-conflicts"); + } if (ctx.foundryEndpoint) { args.push("--set", `inferenceRouter.azure.openai.endpoint=${ctx.foundryEndpoint}`); } @@ -168,6 +192,153 @@ export function buildHelmUpgradeArgs( return args; } +/** A single server-side-apply field-manager conflict, parsed from Helm's error. + * Helm v4 applies releases server-side, so a field another manager owns (e.g. a + * manual `kubectl set env`) makes the apply fail with a structured conflict + * instead of silently overwriting it. */ +export interface FieldManagerConflict { + /** `namespace/name` of the object, e.g. `kars-system/kars-controller`. */ + object: string; + /** Kind of the object, e.g. `Deployment`. */ + kind: string; + /** The field manager that owns the field, e.g. `kubectl-set`. */ + manager: string; + /** The conflicting field path, e.g. + * `.spec.template.spec.containers[name="controller"].env[name="HERMES_RUNTIME_IMAGE"].value`. */ + field: string; +} + +/** Parse server-side-apply field-manager conflicts out of a failed Helm + * upgrade's stderr/stdout. Returns one entry per conflicting field. Exported + * for tests. The shape we match (Helm v4 / Kubernetes SSA): + * + * conflict occurred while applying object kars-system/kars-controller apps/v1, + * Kind=Deployment: Apply failed with 1 conflict: conflict with "kubectl-set" + * using apps/v1: .spec.template.spec.containers[name="controller"] + * .env[name="HERMES_RUNTIME_IMAGE"].value + */ +export function parseHelmFieldConflicts(output: string): FieldManagerConflict[] { + if (!output) return []; + const conflicts: FieldManagerConflict[] = []; + const seen = new Set(); + // Each "applying object , Kind=:" header is + // followed by one or more `conflict with "" using : `. + // Conflicts can repeat for the same object, so track the most recent header. + const objectRe = /applying object (\S+?) \S+, Kind=(\w+)/g; + // Manager quotes may be bare (`"x"`) or Go-%q escaped (`\"x\"`) depending on + // whether the line came from Helm's plain `Error:` output or its structured + // WARN log. Field is the last token; stop at whitespace or a `,`/`;` list + // separator so a multi-conflict list doesn't fold the separator into the path. + const conflictRe = /conflict with \\?"([^"\\]+)\\?" using [^:]+:\s*([^\s,;]+)/g; + + // Walk the string, attributing each conflict to the nearest preceding object. + interface Header { index: number; object: string; kind: string } + const headers: Header[] = []; + for (let m = objectRe.exec(output); m; m = objectRe.exec(output)) { + headers.push({ index: m.index, object: m[1], kind: m[2] }); + } + for (let m = conflictRe.exec(output); m; m = conflictRe.exec(output)) { + let header: Header | undefined; + for (const h of headers) { + if (h.index <= m.index) header = h; + else break; + } + const c: FieldManagerConflict = { + object: header?.object ?? "unknown", + kind: header?.kind ?? "unknown", + manager: m[1], + field: m[2], + }; + const key = `${c.object}|${c.manager}|${c.field}`; + if (!seen.has(key)) { + seen.add(key); + conflicts.push(c); + } + } + return conflicts; +} + +/** Suggest a precise, copy-pasteable command to hand a conflicting field back so + * the upgrade can own it — when the field is a recognisable shape (a container + * env var set by a manual `kubectl set env`). Returns null for field shapes we + * can't safely auto-remediate; the caller then falls back to generic guidance. */ +export function suggestConflictRemediation(c: FieldManagerConflict): string | null { + // .spec.template.spec.containers[name="X"].env[name="ENV"].value → kubectl set env … ENV- + const envMatch = c.field.match(/\.env\[name="([^"]+)"\]/); + if (envMatch && /Deployment/i.test(c.kind)) { + const [ns, name] = c.object.includes("/") ? c.object.split("/") : ["", c.object]; + return `kubectl set env deployment/${name} -n ${ns} ${envMatch[1]}-`; + } + return null; +} + +/** Print the conflict report + the two remediation paths the operator can take. + * Pure presentation; never mutates anything. */ +function reportFieldManagerConflicts(conflicts: FieldManagerConflict[]): void { + console.error(chalk.yellow( + `\n ⚠ Pre-flight: ${conflicts.length} field(s) on this cluster are owned by another\n` + + ` field manager, not Helm. Server-side apply will NOT overwrite them, so the\n` + + ` upgrade was stopped before any change — exactly so it can't half-apply.\n`, + )); + for (const c of conflicts) { + console.error(chalk.yellow(` • ${c.kind} ${c.object}`)); + console.error(chalk.dim(` field: ${c.field}`)); + console.error(chalk.dim(` manager: ${c.manager}`)); + const fix = suggestConflictRemediation(c); + if (fix) console.error(chalk.dim(` release: ${fix}`)); + } + const fixes = conflicts.map(suggestConflictRemediation).filter((f): f is string => f !== null); + console.error(chalk.cyan("\n Choose one:\n")); + console.error( + " A) Let kars take ownership (use only if no other operator manages these\n" + + " fields — it overwrites them with the release's values):\n" + + chalk.bold(" kars upgrade --force-conflicts\n"), + ); + if (fixes.length > 0) { + console.error( + " B) Hand the field(s) back first (keeps the rest of that manager's\n" + + " ownership intact), then re-run the upgrade:\n" + + fixes.map((f) => chalk.bold(` ${f}`)).join("\n") + + chalk.bold(`\n kars upgrade\n`), + ); + } else { + console.error(chalk.dim( + " B) Or remove the foreign field manager's ownership of those fields by\n" + + " hand (e.g. revert the manual `kubectl` edit that set them), then\n" + + " re-run `kars upgrade`.\n", + )); + } +} + +/** Server-side dry-run pre-flight: detect field-manager conflicts WITHOUT + * changing anything. Returns the parsed conflicts (empty if the dry-run + * succeeds). A dry-run that fails for any *other* reason is reported as + * `otherError` and is non-fatal: the real upgrade runs next and surfaces the + * authoritative error rather than blocking on a flaky pre-flight. */ +async function preflightFieldManagerConflicts( + execa: Execa, + ctx: UpgradeContext, + helmPath: string, + target: string, + opts: { skipRuntimeImages?: boolean }, +): Promise<{ conflicts: FieldManagerConflict[]; otherError?: string }> { + const args = buildHelmUpgradeArgs(ctx, helmPath, target, { + skipRuntimeImages: opts.skipRuntimeImages, + dryRunServer: true, + }); + try { + await execa("helm", args, { stdio: "pipe" }); + return { conflicts: [] }; + } catch (err) { + const e = err as { stderr?: string; stdout?: string; message?: string }; + const blob = `${e.stderr ?? ""}\n${e.stdout ?? ""}\n${e.message ?? ""}`; + const conflicts = parseHelmFieldConflicts(blob); + if (conflicts.length > 0) return { conflicts }; + return { conflicts: [], otherError: e.message ?? String(err) }; + } +} + + export function upgradeCommand(): Command { const cmd = new Command("upgrade"); cmd @@ -180,6 +351,12 @@ export function upgradeCommand(): Command { .option("--rollback", "Roll the cluster back to the previous Helm revision.", false) .option("--skip-runtime-images", "Skip the 7 multi-runtime adapter images (faster).", false) .option("--force", "Re-run the upgrade even if already at the target version.", false) + .option( + "--force-conflicts", + "Take ownership of fields managed by another field manager (e.g. a manual " + + "`kubectl set env`). Only safe when no other operator manages those fields.", + false, + ) .option("--yes", "Skip the confirmation prompt (assume yes).", false) .addHelpText("after", ` Examples: @@ -187,6 +364,7 @@ Examples: kars upgrade --to v0.1.16 # Pin a specific release kars upgrade --dry-run # Show changelog + impact + plan, make no changes kars upgrade --yes # Skip the confirmation prompt + kars upgrade --force-conflicts # Take ownership of fields another manager owns kars upgrade --rollback # Revert to the previous Helm revision `) .action(async (options) => { @@ -374,8 +552,34 @@ Examples: // ── Step 4: Atomic Helm upgrade ─────────────────────────────── stepper.step("Upgrading controller + CRDs (atomic Helm upgrade)..."); const helmPath = requireBundledAsset("deploy/helm/kars"); + + // Pre-flight: a server-side dry-run detects fields owned by another + // field manager (e.g. a manual `kubectl set env`). Helm v4 applies + // server-side, so such a field makes the real apply — AND its atomic + // rollback — fail with a conflict, leaving the release wedged. Catch it + // here, before any mutation, and let the operator decide rather than + // silently overwriting a field a live operator may legitimately own. + const pf = await preflightFieldManagerConflicts(execa, ctx, helmPath, target, { + skipRuntimeImages: options.skipRuntimeImages, + }); + if (pf.conflicts.length > 0 && !options.forceConflicts) { + stepper.fail("Field-manager conflict — upgrade stopped before any change"); + reportFieldManagerConflicts(pf.conflicts); + process.exit(1); + } + if (pf.conflicts.length > 0 && options.forceConflicts) { + stepper.detail("info", + `Taking ownership of ${pf.conflicts.length} field(s) from another manager (--force-conflicts)`); + } + if (pf.otherError) { + // Non-conflict dry-run failure: don't block — the real upgrade below + // surfaces the authoritative error (and its --atomic rolls back). + stepper.detail("info", "Pre-flight dry-run inconclusive — proceeding (real upgrade will report any error)"); + } + await execa("helm", buildHelmUpgradeArgs(ctx, helmPath, target, { skipRuntimeImages: options.skipRuntimeImages, + forceConflicts: options.forceConflicts, }), { stdio: "pipe" }); stepper.done("Helm upgrade applied (auto-rollback on failure via --atomic)"); @@ -440,7 +644,19 @@ Examples: process.exit(0); } catch (err) { stepper.stop(); + const e = err as { stderr?: string; stdout?: string; message?: string }; const msg = err instanceof Error ? err.message : String(err); + // If a field-manager conflict slipped past the pre-flight (e.g. a field + // that only appears once the apply runs), give the same actionable + // guidance instead of a raw Helm error. + const lateConflicts = parseHelmFieldConflicts( + `${e.stderr ?? ""}\n${e.stdout ?? ""}\n${msg}`, + ); + if (lateConflicts.length > 0 && !options.forceConflicts) { + console.error(chalk.red(`\n Upgrade failed: field-manager conflict\n`)); + reportFieldManagerConflicts(lateConflicts); + process.exit(1); + } console.error(chalk.red(`\n Upgrade failed: ${msg}\n`)); console.error(chalk.yellow( " The atomic Helm upgrade auto-rolls-back the release on failure. If workloads\n" + From 8789c2451de116b3142c08dd830a79b74019f5f6 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:28:54 +0200 Subject: [PATCH 06/10] docs(mcp): MCP guide + runnable Playwright MCP example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add docs/mcp.md — how to add an MCP server (McpServer CR + mcpServerRefs), how tool calls are governed through the per-pod router, out-of-the-box egress auto-derivation, and the session keepalive that keeps stateful flows (e.g. browser automation) on one live page instead of resetting to about:blank. Add examples/playwright-mcp/ — a browser-automation OpenClaw agent on the official Playwright MCP, end to end: in-cluster MCP Deployment/Service, the McpServer CR, and a KarsSandbox whose only MCP-specific line is one mcpServerRefs entry (no hand-written egress). Wire both into the docs nav, examples catalogue, and CLI reference (kars update). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- docs/README.md | 1 + docs/SUMMARY.md | 1 + docs/cli-reference.md | 38 ++++ docs/examples.md | 1 + docs/mcp.md | 191 ++++++++++++++++++ examples/README.md | 1 + .../playwright-mcp/00-playwright-mcp.yaml | 85 ++++++++ examples/playwright-mcp/01-mcpserver.yaml | 53 +++++ examples/playwright-mcp/02-karssandbox.yaml | 104 ++++++++++ examples/playwright-mcp/README.md | 124 ++++++++++++ 10 files changed, 599 insertions(+) create mode 100644 docs/mcp.md create mode 100644 examples/playwright-mcp/00-playwright-mcp.yaml create mode 100644 examples/playwright-mcp/01-mcpserver.yaml create mode 100644 examples/playwright-mcp/02-karssandbox.yaml create mode 100644 examples/playwright-mcp/README.md diff --git a/docs/README.md b/docs/README.md index ba3e9fe07..d8c01086a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -97,6 +97,7 @@ This section mirrors the chapter groups in **[`SUMMARY.md`](SUMMARY.md)**, which - [kars OpenClaw plugin](openclaw-plugin.md) — the in-sandbox plugin (24 governance-aware tools, 10 skills) every kars-managed agent loads. - [`@kars/mesh` plugin](mesh-plugin.md) — the companion local plugin (built from source, not yet published on npm) for pairing a local OpenClaw with a remote kars cluster. - [Channels & external plugins](channels-plugins.md) — Telegram / Slack / Discord / WhatsApp channels + 3rd-party search/scrape API integrations via CLI flags. +- [MCP servers](mcp.md) — add an MCP server (`McpServer` CR + `mcpServerRefs`), how tool calls are governed, out-of-the-box egress + session keepalive. - [Operator TUI](operator-tui.md) — `kars operator`, the live cluster dashboard. - [Permissions model](permissions.md) — the Azure RBAC `kars up` needs, enumerated. - [Per-sandbox identity](agent-identity.md) — each sandbox runs under its own Entra Agent ID. diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 79fa56256..15c6f8ee9 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -44,6 +44,7 @@ - [kars Hermes plugin](hermes-plugin.md) - [`@kars/mesh` plugin (local OpenClaw)](mesh-plugin.md) - [Channels & external plugins](channels-plugins.md) +- [MCP servers](mcp.md) - [Operator TUI](operator-tui.md) - [Permissions model](permissions.md) - [Per-sandbox identity (Entra Agent ID)](agent-identity.md) diff --git a/docs/cli-reference.md b/docs/cli-reference.md index f35a59eec..3908642f1 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -85,6 +85,10 @@ All commands also inherit Commander.js built-in `--help`. - [mcp](#kars-mcp) - [memory](#kars-memory) +### Self-management + +- [update](#kars-update) + --- ## Lifecycle @@ -236,6 +240,40 @@ tagging model. --- +### `kars update` + +Keep the **CLI itself** current. Every `kars` run also does a quiet, cached +check (at most once per day) and prints a one-line notice when a newer +`@kars-runtime/cli` has been published; `kars update` is the explicit, +always-fresh version that shows the changelog and offers to install. + +> `kars upgrade` upgrades the **cluster** (controller, router, sandboxes). +> `kars update` upgrades the **CLI npm package** on your machine. Different +> things. + +``` +kars update [options] +``` + +| Option | Description | +|---|---| +| `--check` | Only check and report; never prompt or install. Exits non-zero when an update is available (handy in scripts). | +| `--yes` | Install the latest version without prompting. | + +```bash +kars update # check, show changelog, offer to install +kars update --check # report only (CI-friendly) +kars update --yes # non-interactive install +``` + +The install runs `npm install -g @kars-runtime/cli@latest`. Disable the +automatic background notice with `KARS_NO_UPDATE_CHECK=1` (also off in CI and +when stdout isn't a TTY). + +**See also:** [`kars upgrade`](#kars-upgrade) (cluster upgrade) + +--- + ### `kars dev` Runs a fully-policy-enforced sandbox locally for inner-loop development — diff --git a/docs/examples.md b/docs/examples.md index 48ac09821..615d84b96 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -12,6 +12,7 @@ Eight end-to-end examples live under [`examples/`](https://github.com/Azure/kars | [`openai-agents-quickstart`](https://github.com/Azure/kars/tree/main/examples/openai-agents-quickstart) | `OpenAIAgents` (Python) | YAMLs apply once you swap `REPLACE-ME/...` with your image | Hosts an unmodified [OpenAI Agents SDK](https://github.com/openai/openai-agents-python) app. The adapter transparently routes `api.openai.com` through the local inference router. | | [`maf-quickstart`](https://github.com/Azure/kars/tree/main/examples/maf-quickstart) | `MicrosoftAgentFramework` (Python) | YAMLs apply once you swap `REPLACE-ME/...` with your image | Hosts an unmodified [Microsoft Agent Framework](https://github.com/microsoft/agent-framework) app. | | [`byo-quickstart`](https://github.com/Azure/kars/tree/main/examples/byo-quickstart) | `BYO` | Builds + applies cleanly; runtime requires you to bring an image | Brings any container image under the BYO contract. Includes a tiny FastAPI reference agent. | +| [`playwright-mcp`](https://github.com/Azure/kars/tree/main/examples/playwright-mcp) | `OpenClaw` + MCP | ✅ Verified live on AKS — browser flow (navigate → click → snapshot → evaluate) stays on one page; controller auto-derives the MCP egress rule | Browser-automation agent on the official [Playwright MCP](https://github.com/microsoft/playwright-mcp). Shows zero-config MCP wiring (one `mcpServerRefs` entry), out-of-the-box egress auto-derivation, and reliable multi-step browsing via router session keepalive. | ## Multi-agent attack-simulation demos diff --git a/docs/mcp.md b/docs/mcp.md new file mode 100644 index 000000000..d8717160e --- /dev/null +++ b/docs/mcp.md @@ -0,0 +1,191 @@ +# MCP servers in kars + +The [Model Context Protocol](https://modelcontextprotocol.io) (MCP) is how a +kars agent reaches tools it doesn't ship with — a hosted search API, a wiki +reader, a headless browser, your internal services. kars treats every MCP +server as an **untrusted upstream**: the agent never holds its credentials and +never opens a socket to it directly. Everything goes through the per-pod +**inference router**, which discovers the MCP's tools, enforces governance on +every call, and is the only network path to the server. + +This guide covers adding an MCP, how it's reached, authentication, and the two +mechanisms that make MCP support **work out of the box**: egress +auto-derivation and session keepalive. + +> Looking for a runnable example? See +> [`examples/playwright-mcp/`](../examples/playwright-mcp/) — a browser agent on +> the official Playwright MCP, end to end. + +## The model: `McpServer` CR + `mcpServerRefs` + +Two pieces, both declarative: + +1. An **`McpServer`** custom resource describes the server: its URL, the tools + you allow, which sandboxes may use it, and (for hosted servers) its OAuth + config. +2. A sandbox **opts in** by naming that CR in + `spec.governance.mcpServerRefs`. + +```yaml +apiVersion: kars.azure.com/v1alpha1 +kind: McpServer +metadata: + name: playwright + namespace: kars-system # same namespace as the sandbox(es) +spec: + url: "http://playwright-mcp.kars-mcp.svc.cluster.local:8931/mcp" + allowedTools: [browser_navigate, browser_click, browser_snapshot, browser_evaluate] + allowedSandboxes: + matchLabels: { kars.azure.com/sandbox: browser } + productionMode: false + displayName: "Playwright (headless Chromium)" +--- +apiVersion: kars.azure.com/v1alpha1 +kind: KarsSandbox +metadata: + name: browser + namespace: kars-system + labels: { kars.azure.com/sandbox: browser } +spec: + runtime: { kind: OpenClaw, openclaw: {} } + governance: + enabled: true + mcpServerRefs: + - name: playwright # ← the only MCP-specific line + networkPolicy: + defaultDeny: true + egressMode: Strict +``` + +You can also create the CR imperatively: + +```bash +kars mcp apply playwright \ + --namespace kars-system \ + --url http://playwright-mcp.kars-mcp.svc.cluster.local:8931/mcp \ + --allowed-tool browser_navigate --allowed-tool browser_click \ + --allowed-sandbox-label kars.azure.com/sandbox=browser \ + --display-name "Playwright (headless Chromium)" + +kars mcp list -n kars-system +kars mcp get playwright -n kars-system -o yaml +kars mcp delete playwright -n kars-system +``` + +### Key `McpServer.spec` fields + +| Field | Meaning | +|---|---| +| `url` | Streamable-HTTP MCP endpoint. In-cluster Service DNS or a hosted `https://` URL. | +| `allowedTools` | Allow-list of tool names. Empty = none; `["*"]` = all (then gate with `ToolPolicy`). Pin explicitly so an upstream change can't widen the surface. | +| `allowedSandboxes.matchLabels` | Which sandboxes may use this MCP. Empty = same-namespace only. | +| `productionMode` | `true` requires HTTPS + OAuth 2.1. | +| `oauth` | OAuth issuer/audience/resource for `productionMode`. The router mints tokens; the agent never sees them. | +| `bearerFromEnv` | Static outbound bearer from a named env var, for MCPs that use a long-lived API token. | +| `crossNamespaceAllowed` | Allow sandboxes in other namespaces to reference this CR. | + +The full schema is in the [CRD reference](api/crd-reference.md). + +## How a tool call flows + +``` +agent (UID 1000) ──127.0.0.1:8443──▶ inference-router (UID 1001) ──▶ MCP server + tools/call "playwright.browser_navigate" │ + ├─ trust + ToolPolicy check + ├─ audit event + ├─ token budget + └─ outbound auth (OAuth / bearer) +``` + +The agent calls a **namespaced** tool (`.`, e.g. +`playwright.browser_navigate`) on loopback. The router authorises it, dispatches +to the MCP, and returns the result. The agent has no ambient network reach and +no credentials. + +## Out-of-the-box egress + +Because the router is the only path to the MCP, the sandbox's default-deny +NetworkPolicy has to admit the **router → MCP** hop. kars does this for you: +the controller parses the `McpServer.spec.url` and emits the right egress rule +automatically — you do **not** add the MCP host to +`networkPolicy.allowedEndpoints` by hand. + +- **In-cluster Service** (`*.svc.cluster.local`): the controller emits a + `namespaceSelector` rule for the MCP's namespace. This matters under the + Cilium CNI, where a K8s NetworkPolicy `ipBlock` (even `0.0.0.0/0`) only + matches the reserved `world` entity and never an in-cluster pod — so an + `ipBlock` rule would silently fail to admit traffic to another pod. +- **External host, non-443**: a coarse port-level rule; the router's CONNECT + allowlist enforces the exact host. +- **External host, 443**: already covered by the router's blanket HTTPS path — + no extra rule needed. + +Verify it after applying a sandbox: + +```bash +kubectl -n kars- get networkpolicy -o yaml | grep -A4 namespaceSelector +``` + +## Reliable sessions (no `about:blank` mid-task) + +Stateful MCP servers — Playwright is one — keep per-session state (your live +browser page) and run a **server-side heartbeat**: they send the client a +JSON-RPC `ping` every few seconds and destroy the session if no `pong` comes +back within a short window (Playwright's default is 5s). A naive request/response +client never answers those pings, so the server reaps the session; the next +tool call gets `404 Session not found`, the client re-initialises, and the work +lands on a **fresh, blank** page — the agent sees `about:blank` mid-task. + +kars's router is a **well-formed MCP client**: for every stateful session it +holds the standalone SSE stream open and answers the server's `ping`s with +`pong`s, keeping the session — and the agent's live page — alive. Multi-step +flows (`navigate → click → snapshot → evaluate`) therefore stay on one page. +This is automatic for any heartbeating MCP; there's nothing to configure. + +## Authentication + +The agent never holds MCP credentials. Two outbound modes, both handled by the +router: + +- **OAuth 2.1** (`productionMode: true` + `oauth:`): the controller wires JWKS + rotation and the router presents a signed bearer token to the MCP. + + ```yaml + spec: + url: "https://mcp.example.com/mcp" + productionMode: true + oauth: + issuer: "https://login.microsoftonline.com//v2.0" + audience: "api://your-mcp" + ``` + +- **Static bearer** (`bearerFromEnv`): for MCPs that authenticate with a + long-lived API token, stored in the sandbox's `-credentials` secret and + injected by name. The token stays in the router; the agent only sees tools. + +## Tool governance + +`allowedTools` on the `McpServer` is the coarse gate. For per-tool rules +(arguments, rate limits, approval), bind a [`ToolPolicy`](api/crd-reference.md) +via `governance.toolPolicyRef`. MCP tools are subject to the same AGT governance +as built-in tools — trust scoring, audit, and policy all apply. + +For the MCP-specific threat model (tool poisoning, confused-deputy, prompt +injection through tool output), see the +[MCP security top-10](security-mcp-top10.md). + +## Troubleshooting + +| Symptom | Cause | Fix | +|---|---|---| +| Agent says the tool doesn't exist | Tool not in `allowedTools`, or sandbox label doesn't match `allowedSandboxes` | Add the tool / fix the label; re-apply the CR. | +| `404 Session not found`, page resets to `about:blank` | Router not keeping the session alive (pre-0.1.24) | Upgrade the router; keepalive is automatic. | +| Calls time out to an in-cluster MCP | Egress not admitted (e.g. `ipBlock` under Cilium) | Use the MCP's Service DNS `url` so the controller derives a `namespaceSelector` rule; check `kubectl -n kars- get networkpolicy`. | +| `403`/`401` from a hosted MCP | OAuth/bearer misconfigured | Check `oauth.issuer`/`audience` or the `bearerFromEnv` secret. | + +## See also + +- [`examples/playwright-mcp/`](../examples/playwright-mcp/) — runnable end-to-end example. +- [CRD reference](api/crd-reference.md) — full `McpServer` / `KarsSandbox` schema. +- [MCP security top-10](security-mcp-top10.md) — the MCP threat model. +- [Architecture diagrams](architecture-diagrams.md) — the MCP data path. diff --git a/examples/README.md b/examples/README.md index b4f7a2648..c46cb23fd 100644 --- a/examples/README.md +++ b/examples/README.md @@ -12,6 +12,7 @@ End-to-end blueprints you can `kubectl apply -f` after running `kars up`. | [`openai-agents-quickstart`](openai-agents-quickstart/) | OpenAIAgents (Python) | Hosts an unmodified OpenAI Agents SDK app inside a kars sandbox. Same security as default. | | [`maf-quickstart`](maf-quickstart/) | MicrosoftAgentFramework (Python) | Hosts an unmodified Microsoft Agent Framework app inside a kars sandbox. | | [`byo-quickstart`](byo-quickstart/) | BYO | Brings any container image under the BYO contract (`spec.runtime.kind: BYO`). Same isolation, same router. | +| [`playwright-mcp`](playwright-mcp/) | OpenClaw + MCP | Browser-automation agent on the official Playwright MCP. Shows zero-config MCP wiring, **out-of-the-box egress auto-derivation**, and reliable multi-step browsing (session keepalive). | All examples share the same control-plane install and isolation guarantees — only the agent runtime image changes. See diff --git a/examples/playwright-mcp/00-playwright-mcp.yaml b/examples/playwright-mcp/00-playwright-mcp.yaml new file mode 100644 index 000000000..b55df984b --- /dev/null +++ b/examples/playwright-mcp/00-playwright-mcp.yaml @@ -0,0 +1,85 @@ +# 00-playwright-mcp.yaml — the Playwright MCP server, in-cluster. +# +# This is an ordinary Deployment + Service running Microsoft's official +# Playwright MCP image (a real Chromium driven over the Model Context +# Protocol's Streamable-HTTP transport). It is NOT a kars component — it's +# exactly the kind of third-party MCP server you'd bring yourself. kars treats +# it as an untrusted upstream the agent reaches only through its per-pod +# inference router. +# +# It lives in its own namespace (`kars-mcp`) so the example can show kars's +# out-of-the-box egress: the controller parses the McpServer URL +# (01-mcpserver.yaml) and auto-derives a NetworkPolicy rule admitting the +# sandbox router to THIS namespace on :8931 — no hand-written egress needed. +--- +apiVersion: v1 +kind: Namespace +metadata: + name: kars-mcp + labels: + # Auto-added by Kubernetes (>=1.21); the controller's derived egress rule + # selects this namespace by this well-known label, which is the only form + # Cilium honours for in-cluster pod destinations. + kubernetes.io/metadata.name: kars-mcp +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: playwright-mcp + namespace: kars-mcp + labels: + app: playwright-mcp +spec: + replicas: 1 + selector: + matchLabels: + app: playwright-mcp + template: + metadata: + labels: + app: playwright-mcp + spec: + containers: + - name: mcp + image: mcr.microsoft.com/playwright/mcp:latest + imagePullPolicy: IfNotPresent + args: + - --port=8931 + - --host=0.0.0.0 + - --headless + - --browser=chromium + - --no-sandbox + # Playwright MCP rejects requests whose Host header isn't allow-listed. + # The router connects via the Service's cluster DNS name, so list it + # (and the short forms) here. + - --allowed-hosts=playwright-mcp.kars-mcp.svc.cluster.local:8931,playwright-mcp.kars-mcp:8931,playwright-mcp:8931,localhost:8931,localhost + ports: + - containerPort: 8931 + name: mcp + readinessProbe: + tcpSocket: + port: 8931 + initialDelaySeconds: 3 + periodSeconds: 5 + resources: + requests: + cpu: "250m" + memory: "512Mi" + limits: + cpu: "2" + memory: "2Gi" +--- +apiVersion: v1 +kind: Service +metadata: + name: playwright-mcp + namespace: kars-mcp + labels: + app: playwright-mcp +spec: + selector: + app: playwright-mcp + ports: + - name: mcp + port: 8931 + targetPort: 8931 diff --git a/examples/playwright-mcp/01-mcpserver.yaml b/examples/playwright-mcp/01-mcpserver.yaml new file mode 100644 index 000000000..dc4123553 --- /dev/null +++ b/examples/playwright-mcp/01-mcpserver.yaml @@ -0,0 +1,53 @@ +# 01-mcpserver.yaml — register the Playwright MCP server with kars. +# +# The McpServer CR is the declarative "this MCP exists and these tools are +# allowed" record. The controller mirrors it into any sandbox that references +# it (02-karssandbox.yaml) and the per-pod inference router becomes the ONLY +# path to it: the agent process calls tools on 127.0.0.1, the router dispatches +# to this URL, and governance (trust, policy, audit, token budget) runs on the +# way through. The agent never gets the raw network path or any credentials. +# +# Out-of-the-box egress: because `url` resolves to an in-cluster Service, the +# controller derives the sandbox NetworkPolicy egress rule for the `kars-mcp` +# namespace automatically. You do NOT add `playwright-mcp...:8931` to the +# sandbox's `networkPolicy.allowedEndpoints` by hand. +--- +apiVersion: kars.azure.com/v1alpha1 +kind: McpServer +metadata: + name: playwright + namespace: kars-system + labels: + kars.azure.com/sandbox: browser +spec: + # Streamable-HTTP MCP endpoint. In-cluster Service DNS → the controller + # auto-derives the matching egress rule. For a hosted MCP you'd use an + # https:// URL (e.g. https://mcp.example.com/mcp) and nothing else changes. + url: "http://playwright-mcp.kars-mcp.svc.cluster.local:8931/mcp" + + # Pin the tool surface explicitly so an upstream change can't widen it + # without a CR edit. These are the browser-automation tools Playwright MCP + # advertises. + allowedTools: + - browser_navigate + - browser_click + - browser_type + - browser_snapshot + - browser_take_screenshot + - browser_evaluate + - browser_wait_for + - browser_press_key + - browser_select_option + + # Which sandboxes may use this MCP. Empty = same-namespace only; here we + # gate on a label so only the browser sandbox(es) get it. + allowedSandboxes: + matchLabels: + kars.azure.com/sandbox: browser + + # This MCP is unauthenticated (it's our own in-cluster Chromium). For a + # production/hosted MCP, set `productionMode: true` and an `oauth:` block; + # the controller then wires JWKS rotation + signed bearer tokens through the + # router and the agent still never sees a token. + productionMode: false + displayName: "Playwright (headless Chromium browser automation)" diff --git a/examples/playwright-mcp/02-karssandbox.yaml b/examples/playwright-mcp/02-karssandbox.yaml new file mode 100644 index 000000000..ce9539126 --- /dev/null +++ b/examples/playwright-mcp/02-karssandbox.yaml @@ -0,0 +1,104 @@ +# 02-karssandbox.yaml — a browser-automation agent that consumes the +# Playwright MCP. OpenClaw runtime, default kars hardening, governance on. +# +# The only MCP-specific line is `governance.mcpServerRefs` — it names the +# McpServer CR (01-mcpserver.yaml). From that single reference the controller: +# 1. mirrors the MCP's per-server config into this sandbox's namespace and +# mounts it into the inference router, +# 2. AUTO-DERIVES the NetworkPolicy egress rule to reach the in-cluster +# Playwright Service (namespace `kars-mcp`, port 8931) — note there is no +# hand-written `allowedEndpoints` entry for it below, and +# 3. exposes the allow-listed `browser_*` tools to the agent on loopback. +# +# Browser automation "just works": the router holds the MCP session alive +# (answers the server's heartbeat pings), so multi-step flows — navigate → +# click → snapshot → evaluate — keep the SAME live page instead of resetting +# to about:blank mid-task. +--- +apiVersion: kars.azure.com/v1alpha1 +kind: InferencePolicy +metadata: + name: browser-inference + namespace: kars-system + labels: + kars.azure.com/sandbox: browser +spec: + appliesTo: + sandboxName: browser + modelPreference: + primary: + provider: azure-openai + deployment: gpt-4.1 + contentSafety: + requirePromptShields: true + tokenBudget: + dailyTokens: 500000 + perRequestTokens: 128000 +--- +apiVersion: kars.azure.com/v1alpha1 +kind: KarsSandbox +metadata: + name: browser + namespace: kars-system + labels: + # Matches McpServer.spec.allowedSandboxes.matchLabels so this sandbox is + # permitted to use the Playwright MCP. + kars.azure.com/sandbox: browser +spec: + runtime: + kind: OpenClaw + openclaw: {} + + agent: + instructions: |- + You are a browser-automation agent. You have Playwright MCP tools + (browser_navigate, browser_click, browser_type, browser_snapshot, + browser_take_screenshot, browser_evaluate). Use them to drive a real + Chromium browser to fulfil the user's web tasks. Read the accessibility + snapshot to understand the page before you act. + + sandbox: + isolation: "enhanced" + seccompProfile: "kars-strict" + readOnlyRootFilesystem: true + runAsNonRoot: true + allowPrivilegeEscalation: false + writablePaths: + - /sandbox + - /tmp + + inferenceRef: + name: browser-inference + + # Governance ON. `mcpServerRefs` is the consumer side of the McpServer CR. + governance: + enabled: true + toolPolicyRef: + name: kars-default + mcpServerRefs: + - name: playwright + # Demo trust threshold. Production should run `kars mesh setup-trust` and + # raise this to 500+. + trustThreshold: 0 + + # Egress is default-deny and actively enforced. We deliberately DO NOT list + # the Playwright Service here — the controller derives that rule from the + # referenced McpServer's URL. Everything the agent reaches goes through the + # router on loopback, so this list is for direct sandbox→external egress only + # (empty here: the browser runs inside the MCP server, not the agent pod). + networkPolicy: + defaultDeny: true + egressMode: Strict + allowedEndpoints: [] + + azureServices: + - service: ai-foundry + permissions: [inference] + + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "2" + memory: "4Gi" diff --git a/examples/playwright-mcp/README.md b/examples/playwright-mcp/README.md new file mode 100644 index 000000000..20b624f88 --- /dev/null +++ b/examples/playwright-mcp/README.md @@ -0,0 +1,124 @@ +# kars Example: Browser-automation agent on a Playwright MCP + +A sandboxed OpenClaw agent that drives a **real headless Chromium** through the +official [Playwright MCP](https://github.com/microsoft/playwright-mcp) server — +navigating pages, clicking, typing, taking snapshots — entirely under kars +governance and isolation. + +This is the canonical "**bring an MCP server and it just works**" example. The +only MCP-specific line in the whole sandbox spec is a single +`governance.mcpServerRefs` entry; kars does the rest. + +## What it shows + +| Capability | How | +|---|---| +| **Zero-config MCP wiring** | One `McpServer` CR + one `mcpServerRefs` entry. The controller mirrors config into the sandbox and exposes the allow-listed `browser_*` tools on the agent's loopback. | +| **Out-of-the-box egress** | The controller parses the MCP's `url`, sees it's an in-cluster Service, and **auto-derives** the default-deny NetworkPolicy egress rule (`namespaceSelector` for `kars-mcp`, port 8931). You never hand-write that rule. | +| **Reliable multi-step browsing** | The per-pod router holds the MCP session alive by answering Playwright's heartbeat pings, so `navigate → click → snapshot → evaluate` keeps the **same live page** instead of resetting to `about:blank` mid-task. | +| **No credentials in the agent** | The agent calls tools on `127.0.0.1`; the router is the only path to the MCP. Trust, policy, audit and token budget run on every call. | +| **Default kars hardening** | seccomp `kars-strict`, read-only rootfs, drop-ALL caps, non-root, enhanced isolation. | + +## Architecture + +``` +┌─ namespace: kars-system ───────────────────────────────────────────┐ +│ KarsSandbox "browser" (OpenClaw, enhanced isolation) │ +│ ├── openclaw container (UID 1000) ── tools on 127.0.0.1:8443 ──┐ │ +│ └── inference-router (UID 1001) ─────────────────────────────┼─ │ +│ • holds MCP session alive (heartbeat pong) │ │ +│ • governance on every tool call │ │ +└─────────────────────────────────────────────────────────────────┼─┘ + derived egress (namespaceSelector kars-mcp:8931) │ +┌─ namespace: kars-mcp ─────────────────────────────────────────────▼┐ +│ Deployment/Service "playwright-mcp" (mcr.microsoft.com/playwright/mcp) +│ • real headless Chromium, Streamable-HTTP MCP on :8931 │ +└────────────────────────────────────────────────────────────────────┘ +``` + +## Prerequisites + +A running kars control plane (`kars up`) on an AKS cluster, and `kubectl` +pointed at it. + +## Deploy + +```bash +# 1. The Playwright MCP server (in its own namespace). +kubectl apply -f 00-playwright-mcp.yaml +kubectl -n kars-mcp rollout status deploy/playwright-mcp + +# 2. Register the MCP with kars + the browser agent. +kubectl apply -f 01-mcpserver.yaml +kubectl apply -f 02-karssandbox.yaml +kubectl -n kars-system wait --for=jsonpath='{.status.phase}'=Running \ + karssandbox/browser --timeout=180s +``` + +## Use it + +```bash +kars connect browser +``` + +Then ask it to do something on the web, e.g.: + +> Navigate to https://example.com, take a snapshot, and tell me the page heading. + +> Go to https://playwright.dev, click "Get started", and summarise the page. + +Multi-step flows work because the router keeps the browser session (and its +live page) alive across calls. + +## Verify the out-of-the-box egress (optional) + +The controller derives the egress rule from the MCP URL — confirm it's there +without you having written it: + +```bash +kubectl -n kars-browser get networkpolicy -o yaml | grep -A4 namespaceSelector +# → a rule selecting kubernetes.io/metadata.name: kars-mcp on TCP 8931 +``` + +## How it works + +1. **`McpServer` CR** (`01-mcpserver.yaml`) declares the endpoint, the + allow-listed tools, and which sandboxes may use it. +2. **`mcpServerRefs`** in the sandbox (`02-karssandbox.yaml`) opts the agent in. + The controller mirrors the MCP's config into the sandbox namespace, mounts + it into the router, and — because the URL is an in-cluster Service — + **auto-derives the NetworkPolicy egress** to `kars-mcp:8931`. (Cilium only + honours a `namespaceSelector` for in-cluster pod destinations, never an + `ipBlock`; the controller emits the correct form.) +3. **The inference router** discovers the MCP's tools, namespaces them, and + exposes them to the agent on loopback. It **acts as a well-formed MCP + client**: it holds the standalone SSE stream open and answers the server's + heartbeat `ping`s with `pong`s, so Playwright never reaps the session. That + is what keeps a browsing flow on one live page. + +## Going hosted + +To point at a **hosted** MCP instead of an in-cluster one, change just the URL: + +```yaml +spec: + url: "https://mcp.example.com/mcp" + productionMode: true + oauth: + issuer: "https://login.microsoftonline.com//v2.0" + audience: "api://your-mcp" +``` + +External `https://` hosts on 443 are already covered by the router's HTTPS +path; the controller adds a coarse port rule for non-443 hosts and the router +enforces the exact host. The agent still never sees a token. + +## Clean up + +```bash +kubectl delete -f 02-karssandbox.yaml -f 01-mcpserver.yaml -f 00-playwright-mcp.yaml +``` + +See [`docs/mcp.md`](../../docs/mcp.md) for the full MCP guide and +[`docs/api/crd-reference.md`](../../docs/api/crd-reference.md) for the +`McpServer` / `KarsSandbox` schemas. From 9d4e77053cc92f6a4779699e07e17ac92a641db6 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:39:19 +0200 Subject: [PATCH 07/10] refactor(controller): extract reconciler::mcp_egress submodule Move the MCP egress derivation (mcp_url_host_port / cluster_internal_namespace / mcp_egress_rule + the per-ref derive_mcp_egress_rules loop) and its unit tests out of reconciler/mod.rs into a focused reconciler::mcp_egress module. Keeps mod.rs under the ci/check-loc.sh phase0 budget (3700 LOC) and groups the egress logic in one place. Pure code-move + the reconcile loop now calls derive_mcp_egress_rules; no behaviour change. 854 controller tests pass. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- controller/src/reconciler/mcp_egress.rs | 275 +++++++++++++++++++++ controller/src/reconciler/mod.rs | 305 ++---------------------- 2 files changed, 299 insertions(+), 281 deletions(-) create mode 100644 controller/src/reconciler/mcp_egress.rs diff --git a/controller/src/reconciler/mcp_egress.rs b/controller/src/reconciler/mcp_egress.rs new file mode 100644 index 000000000..fe6368a91 --- /dev/null +++ b/controller/src/reconciler/mcp_egress.rs @@ -0,0 +1,275 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! MCP egress derivation. +//! +//! "Apply an `McpServer` and reference it — it just works." The per-pod +//! inference router is the only network path to an MCP server, so the sandbox's +//! default-deny `NetworkPolicy` must admit the router→server hop without the +//! operator also hand-writing a `networkPolicy.allowedEndpoints` entry. These +//! helpers turn an `McpServer.spec.url` into the matching egress rule; the +//! reconciler walks every referenced server and adds the derived rules. + +use serde_json::json; + +/// Auto-derive the NetworkPolicy egress rules that admit the sandbox router to +/// every `McpServer` the sandbox references. For each referent we fetch its +/// `spec.url`, parse it, and build the matching rule (see [`mcp_egress_rule`]), +/// skipping any that duplicate `existing` rules or each other. +/// +/// A missing (`404`) referent is logged and skipped — the JWKS-mirror path +/// surfaces it elsewhere. Any other API error is returned so the caller can +/// requeue. Empty / unparseable URLs are skipped (those endpoints still need an +/// explicit allowlist entry). +pub(crate) async fn derive_mcp_egress_rules( + client: &kube::Client, + sandbox_self_ns: &str, + refs: &[&crate::mcp_server::LocalObjectRef], + sandbox_name: &str, + existing: &[serde_json::Value], +) -> Result, kube::Error> { + use kube::api::Api; + let mcp_api: Api = + Api::namespaced(client.clone(), sandbox_self_ns); + let mut out: Vec = Vec::new(); + for mcp_ref in refs { + let ref_name = mcp_ref.name.trim(); + if ref_name.is_empty() { + continue; + } + let url = match mcp_api.get(ref_name).await { + Ok(mcp) => mcp.spec.url.unwrap_or_default(), + Err(kube::Error::Api(ae)) if ae.code == 404 => { + tracing::warn!( + sandbox = %sandbox_name, + mcp = %ref_name, + "referenced McpServer not found; egress not auto-derived", + ); + continue; + } + Err(e) => return Err(e), + }; + if url.is_empty() { + continue; + } + let Some((host, port)) = mcp_url_host_port(&url) else { + tracing::warn!( + sandbox = %sandbox_name, + mcp = %ref_name, + url = %url, + "McpServer url not parseable for egress derivation; skipping", + ); + continue; + }; + if let Some(rule) = mcp_egress_rule(&host, port) + && !existing.contains(&rule) + && !out.contains(&rule) + { + tracing::info!( + sandbox = %sandbox_name, + mcp = %ref_name, + host = %host, + port, + "auto-derived MCP egress rule", + ); + out.push(rule); + } + } + Ok(out) +} + +/// Parse an `McpServer.spec.url` into the `(host, port)` the sandbox's +/// inference-router must be allowed to reach. +/// +/// Returns `None` when the URL is empty or unparseable (the caller skips +/// egress derivation and lets the router surface the misconfiguration). +/// The port defaults from the scheme (`https` → 443, `http` → 80) when not +/// explicitly given. Only `http`/`https` MCP transports are supported. +/// +/// Examples: +/// - `https://mcp.deepwiki.com/mcp` → `("mcp.deepwiki.com", 443)` +/// - `http://playwright-mcp.default.svc.cluster.local:8931/mcp` +/// → `("playwright-mcp.default.svc.cluster.local", 8931)` +pub(crate) fn mcp_url_host_port(url: &str) -> Option<(String, u16)> { + let url = url.trim(); + let (scheme, rest) = url.split_once("://")?; + let default_port: u16 = match scheme.to_ascii_lowercase().as_str() { + "https" => 443, + "http" => 80, + _ => return None, + }; + // Strip path / query / fragment — everything from the first '/', '?' or '#'. + let authority = rest.split(['/', '?', '#']).next().unwrap_or("").trim(); + if authority.is_empty() { + return None; + } + // Drop any userinfo ("user:pass@host"). + let host_port = authority.rsplit('@').next().unwrap_or(authority); + // IPv6 literal `[::1]:8931` — not expected for MCP services; reject so the + // caller falls back rather than mis-splitting on the inner colons. + if host_port.starts_with('[') { + return None; + } + let (host, port) = match host_port.rsplit_once(':') { + Some((h, p)) => { + let port = p.parse::().ok()?; + (h, port) + } + None => (host_port, default_port), + }; + if host.is_empty() { + return None; + } + Some((host.to_string(), port)) +} + +/// Build the NetworkPolicy egress rule that admits the sandbox router to an +/// MCP server at `host:port`. In-cluster Service DNS names get a +/// `namespaceSelector` rule (the only form Cilium honours for pod +/// destinations); external hosts get the coarse port-level `ipBlock` +/// (host-level enforcement lives in the router). Returns `None` for an +/// external host on 443 — already covered by the blanket HTTPS rule. +pub(crate) fn mcp_egress_rule(host: &str, port: u16) -> Option { + if let Some(ns) = cluster_internal_namespace(host) { + Some(json!({ + "to": [{ + "namespaceSelector": { + "matchLabels": { "kubernetes.io/metadata.name": ns } + } + }], + "ports": [{"protocol": "TCP", "port": port}] + })) + } else if port == 443 { + None + } else { + Some(json!({ + "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}], + "ports": [{"protocol": "TCP", "port": port}] + })) + } +} + +/// If `host` is a cluster-internal Kubernetes DNS name, return the namespace +/// it resolves into; otherwise `None`. +/// +/// In-cluster MCP servers / egress endpoints are addressed by their Service +/// DNS name — `..svc.cluster.local` (or the `.svc` short form). For +/// those we MUST express the NetworkPolicy egress with a `namespaceSelector`, +/// not an `ipBlock`: under the Cilium CNI a K8s NetworkPolicy `ipBlock` CIDR +/// (including `0.0.0.0/0`) only matches the reserved `world` entity and is +/// NOT applied to in-cluster pod endpoints, so an `ipBlock`-based rule silently +/// fails to admit traffic to another pod. Selecting the destination namespace +/// by its well-known `kubernetes.io/metadata.name` label is the portable way to +/// open egress to an in-cluster Service. +/// +/// Returns `None` for external hostnames (e.g. `api.openai.com`) and for bare, +/// namespace-ambiguous names — those keep the coarse `ipBlock` port rule, with +/// fine-grained host enforcement handled by the router's CONNECT allowlist. +pub(crate) fn cluster_internal_namespace(host: &str) -> Option { + let host = host.trim().trim_end_matches('.'); + // `..svc.cluster.local` or the cluster-suffixless `..svc`. + let stripped = host + .strip_suffix(".svc.cluster.local") + .or_else(|| host.strip_suffix(".svc"))?; + // After stripping the `.svc[.cluster.local]` suffix, the remainder is + // `.`; the namespace is the last label. + let ns = stripped.rsplit('.').next()?; + if ns.is_empty() || ns == stripped { + // `stripped` had no `.` → no namespace label present. + return None; + } + Some(ns.to_string()) +} + +#[cfg(test)] +mod tests { + use super::{cluster_internal_namespace, mcp_egress_rule, mcp_url_host_port}; + + #[test] + fn cluster_internal_namespace_parses_fqdn_forms() { + assert_eq!( + cluster_internal_namespace("playwright-mcp.default.svc.cluster.local"), + Some("default".to_string()) + ); + assert_eq!( + cluster_internal_namespace("playwright-mcp.default.svc"), + Some("default".to_string()) + ); + // Trailing dot (fully-qualified) is tolerated. + assert_eq!( + cluster_internal_namespace("svc-x.team-a.svc.cluster.local."), + Some("team-a".to_string()) + ); + } + + #[test] + fn cluster_internal_namespace_rejects_external_and_ambiguous() { + // External hostnames keep the coarse ipBlock rule. + assert_eq!(cluster_internal_namespace("api.openai.com"), None); + assert_eq!(cluster_internal_namespace("example.com"), None); + // Bare names without a namespace label are ambiguous → None. + assert_eq!(cluster_internal_namespace("playwright-mcp"), None); + // A `.svc` suffix with no namespace label is not a valid in-cluster name. + assert_eq!(cluster_internal_namespace("svc"), None); + } + + #[test] + fn mcp_url_host_port_parses_common_forms() { + // Remote HTTPS MCP — default port 443. + assert_eq!( + mcp_url_host_port("https://mcp.deepwiki.com/mcp"), + Some(("mcp.deepwiki.com".to_string(), 443)) + ); + assert_eq!( + mcp_url_host_port("https://api.githubcopilot.com/mcp"), + Some(("api.githubcopilot.com".to_string(), 443)) + ); + // In-cluster HTTP MCP with explicit port. + assert_eq!( + mcp_url_host_port("http://playwright-mcp.default.svc.cluster.local:8931/mcp"), + Some(("playwright-mcp.default.svc.cluster.local".to_string(), 8931)) + ); + // http default port 80, no path. + assert_eq!( + mcp_url_host_port("http://svc.ns.svc.cluster.local"), + Some(("svc.ns.svc.cluster.local".to_string(), 80)) + ); + // Query/fragment are stripped. + assert_eq!( + mcp_url_host_port("https://host.example:8443/mcp?x=1#frag"), + Some(("host.example".to_string(), 8443)) + ); + } + + #[test] + fn mcp_url_host_port_rejects_invalid() { + assert_eq!(mcp_url_host_port(""), None); + assert_eq!(mcp_url_host_port("not-a-url"), None); + assert_eq!(mcp_url_host_port("ftp://host:21/x"), None); + assert_eq!(mcp_url_host_port("https://"), None); + // Non-numeric port. + assert_eq!(mcp_url_host_port("https://host:notaport/mcp"), None); + // IPv6 literals are not supported (fall back rather than mis-split). + assert_eq!(mcp_url_host_port("http://[::1]:8931/mcp"), None); + } + + #[test] + fn mcp_egress_rule_shapes_by_destination() { + // In-cluster → namespaceSelector on the destination namespace + port. + let rule = mcp_egress_rule("playwright-mcp.default.svc.cluster.local", 8931) + .expect("in-cluster rule"); + assert_eq!( + rule["to"][0]["namespaceSelector"]["matchLabels"]["kubernetes.io/metadata.name"], + "default" + ); + assert_eq!(rule["ports"][0]["port"], 8931); + + // External non-443 → coarse ipBlock + port. + let rule = mcp_egress_rule("mcp.example.com", 8080).expect("external non-443 rule"); + assert_eq!(rule["to"][0]["ipBlock"]["cidr"], "0.0.0.0/0"); + assert_eq!(rule["ports"][0]["port"], 8080); + + // External 443 → None (blanket HTTPS rule already covers it). + assert!(mcp_egress_rule("mcp.deepwiki.com", 443).is_none()); + } +} diff --git a/controller/src/reconciler/mod.rs b/controller/src/reconciler/mod.rs index 018271a90..ce919983d 100644 --- a/controller/src/reconciler/mod.rs +++ b/controller/src/reconciler/mod.rs @@ -36,8 +36,11 @@ use crate::fedcred::{FedCredConfig, FedCredManager}; pub(crate) mod byo_contract; mod dev_env; pub(crate) mod governance_mounts; +mod mcp_egress; pub(crate) mod trustgraph_mount; +use mcp_egress::mcp_egress_rule; + /// Build pod security context, conditionally including SELinux options and /// choosing between RuntimeDefault and Localhost seccomp profiles. /// For Kata (confidential), we use RuntimeDefault since the VM provides isolation. @@ -89,109 +92,6 @@ pub(crate) fn isolation_scheduling(isolation: &str) -> (Option<&'static str>, &' } } -/// Parse an `McpServer.spec.url` into the `(host, port)` the sandbox's -/// inference-router must be allowed to reach. -/// -/// Returns `None` when the URL is empty or unparseable (the caller skips -/// egress derivation and lets the router surface the misconfiguration). -/// The port defaults from the scheme (`https` → 443, `http` → 80) when not -/// explicitly given. Only `http`/`https` MCP transports are supported. -/// -/// Examples: -/// - `https://mcp.deepwiki.com/mcp` → `("mcp.deepwiki.com", 443)` -/// - `http://playwright-mcp.default.svc.cluster.local:8931/mcp` -/// → `("playwright-mcp.default.svc.cluster.local", 8931)` -pub(crate) fn mcp_url_host_port(url: &str) -> Option<(String, u16)> { - let url = url.trim(); - let (scheme, rest) = url.split_once("://")?; - let default_port: u16 = match scheme.to_ascii_lowercase().as_str() { - "https" => 443, - "http" => 80, - _ => return None, - }; - // Strip path / query / fragment — everything from the first '/', '?' or '#'. - let authority = rest.split(['/', '?', '#']).next().unwrap_or("").trim(); - if authority.is_empty() { - return None; - } - // Drop any userinfo ("user:pass@host"). - let host_port = authority.rsplit('@').next().unwrap_or(authority); - // IPv6 literal `[::1]:8931` — not expected for MCP services; reject so the - // caller falls back rather than mis-splitting on the inner colons. - if host_port.starts_with('[') { - return None; - } - let (host, port) = match host_port.rsplit_once(':') { - Some((h, p)) => { - let port = p.parse::().ok()?; - (h, port) - } - None => (host_port, default_port), - }; - if host.is_empty() { - return None; - } - Some((host.to_string(), port)) -} - -/// Build the NetworkPolicy egress rule that admits the sandbox router to an -/// MCP server at `host:port`. In-cluster Service DNS names get a -/// `namespaceSelector` rule (the only form Cilium honours for pod -/// destinations); external hosts get the coarse port-level `ipBlock` -/// (host-level enforcement lives in the router). Returns `None` for an -/// external host on 443 — already covered by the blanket HTTPS rule. -fn mcp_egress_rule(host: &str, port: u16) -> Option { - if let Some(ns) = cluster_internal_namespace(host) { - Some(json!({ - "to": [{ - "namespaceSelector": { - "matchLabels": { "kubernetes.io/metadata.name": ns } - } - }], - "ports": [{"protocol": "TCP", "port": port}] - })) - } else if port == 443 { - None - } else { - Some(json!({ - "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}], - "ports": [{"protocol": "TCP", "port": port}] - })) - } -} - -/// If `host` is a cluster-internal Kubernetes DNS name, return the namespace -/// it resolves into; otherwise `None`. -/// -/// In-cluster MCP servers / egress endpoints are addressed by their Service -/// DNS name — `..svc.cluster.local` (or the `.svc` short form). For -/// those we MUST express the NetworkPolicy egress with a `namespaceSelector`, -/// not an `ipBlock`: under the Cilium CNI a K8s NetworkPolicy `ipBlock` CIDR -/// (including `0.0.0.0/0`) only matches the reserved `world` entity and is -/// NOT applied to in-cluster pod endpoints, so an `ipBlock`-based rule silently -/// fails to admit traffic to another pod. Selecting the destination namespace -/// by its well-known `kubernetes.io/metadata.name` label is the portable way to -/// open egress to an in-cluster Service. -/// -/// Returns `None` for external hostnames (e.g. `api.openai.com`) and for bare, -/// namespace-ambiguous names — those keep the coarse `ipBlock` port rule, with -/// fine-grained host enforcement handled by the router's CONNECT allowlist. -pub(crate) fn cluster_internal_namespace(host: &str) -> Option { - let host = host.trim().trim_end_matches('.'); - // `..svc.cluster.local` or the cluster-suffixless `..svc`. - let stripped = host - .strip_suffix(".svc.cluster.local") - .or_else(|| host.strip_suffix(".svc"))?; - // After stripping the `.svc[.cluster.local]` suffix, the remainder is - // `.`; the namespace is the last label. - let ns = stripped.rsplit('.').next()?; - if ns.is_empty() || ns == stripped { - // `stripped` had no `.` → no namespace label present. - return None; - } - Some(ns.to_string()) -} - /// Build the egress-guard init-container command. /// /// Standard sandboxes (every kind except SRE) get the full lockdown: @@ -354,99 +254,6 @@ mod egress_guard_tests { ); } } - - #[test] - fn cluster_internal_namespace_parses_fqdn_forms() { - use super::cluster_internal_namespace; - assert_eq!( - cluster_internal_namespace("playwright-mcp.default.svc.cluster.local"), - Some("default".to_string()) - ); - assert_eq!( - cluster_internal_namespace("playwright-mcp.default.svc"), - Some("default".to_string()) - ); - // Trailing dot (fully-qualified) is tolerated. - assert_eq!( - cluster_internal_namespace("svc-x.team-a.svc.cluster.local."), - Some("team-a".to_string()) - ); - } - - #[test] - fn cluster_internal_namespace_rejects_external_and_ambiguous() { - use super::cluster_internal_namespace; - // External hostnames keep the coarse ipBlock rule. - assert_eq!(cluster_internal_namespace("api.openai.com"), None); - assert_eq!(cluster_internal_namespace("example.com"), None); - // Bare names without a namespace label are ambiguous → None. - assert_eq!(cluster_internal_namespace("playwright-mcp"), None); - // A `.svc` suffix with no namespace label is not a valid in-cluster name. - assert_eq!(cluster_internal_namespace("svc"), None); - } - - #[test] - fn mcp_url_host_port_parses_common_forms() { - use super::mcp_url_host_port; - // Remote HTTPS MCP — default port 443. - assert_eq!( - mcp_url_host_port("https://mcp.deepwiki.com/mcp"), - Some(("mcp.deepwiki.com".to_string(), 443)) - ); - assert_eq!( - mcp_url_host_port("https://api.githubcopilot.com/mcp"), - Some(("api.githubcopilot.com".to_string(), 443)) - ); - // In-cluster HTTP MCP with explicit port. - assert_eq!( - mcp_url_host_port("http://playwright-mcp.default.svc.cluster.local:8931/mcp"), - Some(("playwright-mcp.default.svc.cluster.local".to_string(), 8931)) - ); - // http default port 80, no path. - assert_eq!( - mcp_url_host_port("http://svc.ns.svc.cluster.local"), - Some(("svc.ns.svc.cluster.local".to_string(), 80)) - ); - // Query/fragment are stripped. - assert_eq!( - mcp_url_host_port("https://host.example:8443/mcp?x=1#frag"), - Some(("host.example".to_string(), 8443)) - ); - } - - #[test] - fn mcp_url_host_port_rejects_invalid() { - use super::mcp_url_host_port; - assert_eq!(mcp_url_host_port(""), None); - assert_eq!(mcp_url_host_port("not-a-url"), None); - assert_eq!(mcp_url_host_port("ftp://host:21/x"), None); - assert_eq!(mcp_url_host_port("https://"), None); - // Non-numeric port. - assert_eq!(mcp_url_host_port("https://host:notaport/mcp"), None); - // IPv6 literals are not supported (fall back rather than mis-split). - assert_eq!(mcp_url_host_port("http://[::1]:8931/mcp"), None); - } - - #[test] - fn mcp_egress_rule_shapes_by_destination() { - use super::mcp_egress_rule; - // In-cluster → namespaceSelector on the destination namespace + port. - let rule = mcp_egress_rule("playwright-mcp.default.svc.cluster.local", 8931) - .expect("in-cluster rule"); - assert_eq!( - rule["to"][0]["namespaceSelector"]["matchLabels"]["kubernetes.io/metadata.name"], - "default" - ); - assert_eq!(rule["ports"][0]["port"], 8931); - - // External non-443 → coarse ipBlock + port. - let rule = mcp_egress_rule("mcp.example.com", 8080).expect("external non-443 rule"); - assert_eq!(rule["to"][0]["ipBlock"]["cidr"], "0.0.0.0/0"); - assert_eq!(rule["ports"][0]["port"], 8080); - - // External 443 → None (blanket HTTPS rule already covers it). - assert!(mcp_egress_rule("mcp.deepwiki.com", 443).is_none()); - } } #[derive(Debug, thiserror::Error)] enum ReconcileError { @@ -1382,96 +1189,32 @@ async fn reconcile(sandbox: Arc, ctx: Arc) -> Result..svc.cluster.local`) - // must be admitted with a `namespaceSelector`: a `0.0.0.0/0` ipBlock - // does NOT match in-cluster pods under Cilium, so the agent's router - // could never reach an in-cluster MCP server. External hosts keep the - // coarse port-level ipBlock (host-level enforcement lives in the - // router's CONNECT allowlist). - if let Some(ns) = cluster_internal_namespace(&ep.host) { - egress_rules.push(json!({ - "to": [{ - "namespaceSelector": { - "matchLabels": { "kubernetes.io/metadata.name": ns } - } - }], - "ports": [{"protocol": "TCP", "port": port}] - })); - } else { - egress_rules.push(json!({ - "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}], - "ports": [{"protocol": "TCP", "port": port}] - })); + // In-cluster endpoints need a `namespaceSelector` (a `0.0.0.0/0` + // ipBlock doesn't match in-cluster pods under Cilium); external + // hosts keep the coarse ipBlock. `mcp_egress_rule` encodes both and + // only returns `None` for 443, already skipped above. + if let Some(rule) = mcp_egress_rule(&ep.host, port) { + egress_rules.push(rule); } } } - // ── Auto-derive egress to referenced McpServer endpoints ───────────── - // - // "Apply an McpServer CRD and reference it — it just works." The router - // is the only network path to an MCP server, so the sandbox NetworkPolicy - // must admit the router→server hop WITHOUT the operator also hand-writing - // a `networkPolicy.allowedEndpoints` entry. For every McpServer the - // sandbox references via `governance.mcpServerRefs`, we parse its - // `spec.url` and emit the matching egress rule: - // - in-cluster Service DNS → `namespaceSelector` (Cilium-correct) - // - external host, non-443 → coarse `ipBlock` + port - // - external host, 443 → already covered by the blanket HTTPS rule - // - // Remote HTTPS MCPs (DeepWiki, GitHub, hosted Playwright) therefore need - // zero extra config; self-hosted in-cluster MCPs on a non-443 port get - // their egress opened automatically. + // Auto-derive egress to every referenced McpServer so "apply an McpServer + // and reference it" just works — the router is the only path to it, so the + // default-deny NetworkPolicy must admit the router→server hop. De-duplicated. + match mcp_egress::derive_mcp_egress_rules( + client, + &sandbox_self_ns, + &governance_config.effective_mcp_server_refs(), + &name, + &egress_rules, + ) + .await { - let mcp_api: Api = - Api::namespaced(client.clone(), &sandbox_self_ns); - for mcp_ref in governance_config.effective_mcp_server_refs() { - let ref_name = mcp_ref.name.trim(); - if ref_name.is_empty() { - continue; - } - let url = match mcp_api.get(ref_name).await { - Ok(mcp) => mcp.spec.url.unwrap_or_default(), - Err(kube::Error::Api(ae)) if ae.code == 404 => { - // Missing referent is surfaced by the JWKS-mirror path - // below; here we just skip egress derivation for it. - tracing::warn!( - sandbox = %name, - mcp = %ref_name, - "referenced McpServer not found; egress not auto-derived", - ); - continue; - } - Err(e) => { - tracing::error!(error = %e, sandbox = %name, mcp = %ref_name, "McpServer lookup failed"); - return Ok(Action::requeue(Duration::from_secs(15))); - } - }; - // Empty when the signed-bundle authoring path supplies the url; - // those endpoints still need an explicit allowlist entry. - if url.is_empty() { - continue; - } - let Some((host, port)) = mcp_url_host_port(&url) else { - tracing::warn!( - sandbox = %name, - mcp = %ref_name, - url = %url, - "McpServer url not parseable for egress derivation; skipping", - ); - continue; - }; - if let Some(rule) = mcp_egress_rule(&host, port) - && !egress_rules.contains(&rule) - { - tracing::info!( - sandbox = %name, - mcp = %ref_name, - host = %host, - port, - "auto-derived MCP egress rule", - ); - egress_rules.push(rule); - } + Ok(rules) => egress_rules.extend(rules), + Err(e) => { + tracing::error!(error = %e, sandbox = %name, "MCP egress derivation failed"); + return Ok(Action::requeue(Duration::from_secs(15))); } } From 66c53ab2a434a7283055869517df1d0cac7e243e Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:40:38 +0200 Subject: [PATCH 08/10] docs(security): security audit for MCP OOTB + CLI update (v0.1.24) Signed-off-by: Pal Lakatos-Toth Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../2026-06-30-mcp-out-of-the-box.md | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 docs/security-audits/2026-06-30-mcp-out-of-the-box.md diff --git a/docs/security-audits/2026-06-30-mcp-out-of-the-box.md b/docs/security-audits/2026-06-30-mcp-out-of-the-box.md new file mode 100644 index 000000000..1c9721b93 --- /dev/null +++ b/docs/security-audits/2026-06-30-mcp-out-of-the-box.md @@ -0,0 +1,98 @@ +# Security Audit — MCP out-of-the-box: session keepalive, egress auto-derive, CLI update (v0.1.24) + +Date: 2026-06-30 +Scope: `inference-router/src/mcp/forwarder.rs`, `controller/src/reconciler/mod.rs`, `controller/src/reconciler/mcp_egress.rs`, `cli/src/commands/update.ts`, `cli/src/lib/update-check.ts`, `cli/src/commands/upgrade.ts`. +Gated paths: `inference-router/src/mcp/forwarder.rs`, `controller/src/reconciler/...`, `cli/src/commands/...`. + +## Summary + +This change makes MCP servers work out of the box and adds a CLI self-update +notice. Four capability-touching pieces: + +1. **Router MCP session keepalive** — the forwarder now holds the standalone + `GET /mcp` SSE stream open and answers a heartbeating MCP server's `ping`s + with `pong`s so the upstream doesn't reap the session mid-task. +2. **Controller MCP egress auto-derivation** — the controller parses each + referenced `McpServer.spec.url` and emits the matching default-deny + NetworkPolicy egress rule (namespaceSelector for in-cluster, coarse ipBlock + for external non-443, nothing for external 443). +3. **CLI `kars update` + automatic update notice** — best-effort npm version + check + optional `npm install -g @kars-runtime/cli@latest`. +4. **`kars upgrade` pre-flight** — server-side-apply conflict detection before + the atomic Helm upgrade (read-only dry-run). + +## T1: New capability / attack surface? (YES — bounded) + +- **Keepalive (router→MCP):** opens one long-lived GET to an MCP endpoint the + router was *already* authorised to call (same URL, headers, session id as the + tools/call path). It only ever **reads** SSE frames and **writes** a JSON-RPC + `pong` (`{"id":,"result":{}}`) — no new destinations, no new credentials, + no agent-reachable surface (the agent still only sees loopback tools). The + pong responder replies solely to server-initiated `ping`s; any other inbound + frame is ignored. The task is bounded by a per-request timeout and is + aborted/replaced on session re-init. +- **Egress auto-derivation:** *narrows* rather than widens — it adds precisely + the rule needed to reach a *referenced* `McpServer` and nothing else. The + derivation is driven only by `governance.mcpServerRefs` (operator-authored) + and the referenced CR's `url`; an attacker cannot inject a rule without RBAC + to create an `McpServer` and to reference it from a sandbox. In-cluster URLs + yield a `namespaceSelector` scoped to the MCP's namespace + exact port; + external non-443 yields a coarse port rule with host enforcement still at the + router; external 443 adds nothing. Unparseable/missing referents are skipped + (fail-closed: no rule added). +- **CLI update check:** introduces two outbound HTTPS calls from the operator's + workstation (npm registry `latest` + GitHub release notes), each bounded by a + 1.5s timeout, cached ≤24h in `~/.kars/update-check.json`, off in CI/non-TTY + and via `KARS_NO_UPDATE_CHECK=1`. It never auto-executes an install: the + passive notice only prints text; the actual `npm install -g` runs only on an + explicit `kars update` (interactive confirm, or `--yes`). No tokens, no + cluster access, no code executed from the network response (only a version + string and a changelog line are read). +- **Upgrade pre-flight:** a `helm upgrade --dry-run=server` — read-only, + mutates nothing — whose output is parsed for field-manager conflicts. + +## T2: Security-control change? (NEUTRAL→IMPROVED) + +- The router remains the sole network path to MCP servers; governance (trust, + ToolPolicy, audit, token budget) on tool calls is unchanged. Keepalive adds + no tool surface. +- Egress stays **default-deny**; auto-derivation produces the *same* rules an + operator would have hand-written, but correctly (namespaceSelector under + Cilium, where an ipBlock silently fails for in-cluster pods). This removes a + foot-gun that previously led operators to widen egress by hand. +- The session-loss classifier was tightened to fire only on genuine 4xx hard + signals, so a healthy 2xx tool result that merely mentions "session" no + longer triggers a destructive re-init. + +## T3: Availability / fail-open risk? (REDUCED) + +- Keepalive **reduces** an availability bug: stateful MCP sessions were being + reaped after ~5s, breaking multi-step flows. All keepalive failures are + swallowed and the task simply exits/reconnects; it never blocks a tool call. +- Egress derivation that errors on the API returns a short requeue (no partial + NetworkPolicy is applied). A missing/unparseable referent is skipped rather + than failing the whole reconcile. +- The CLI update check is fully best-effort: every error path returns silently; + it can never block or fail a `kars` command. The upgrade pre-flight is + additive and only *stops before* a known-bad apply (fail-closed, safer). + +## Verification + +- Rust: `cargo test --all` green (controller 854 + router 957 + integration); + `cargo clippy --all-targets -D warnings` clean; `cargo audit` / + `cargo deny check` clean (anyhow bumped to 1.0.103, RUSTSEC-2026-0190). +- CLI: build + typecheck + `oxlint` (0 errors) + 928 vitest tests green + (incl. new `update-check` suite). +- Live AKS validation: both sandbox routers hold a persistent GET SSE stream to + Playwright that survives well past the 5s reaper; zero `session lost` log + lines. Example CRDs validated with `kubectl apply --dry-run=server`. +- `ci/check-loc.sh`, `ci/no-stubs.sh`, `ci/check-copyright-headers.sh` pass. + +## Verdict + +Accept — the new surface is bounded, operator-gated, and fail-closed; the net +effect is improved availability (session keepalive), tighter-by-construction +egress, and a non-intrusive, consent-gated CLI updater. + +Signed-off-by: Pal Lakatos-Toth +Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> From b6859217ba73af77bf833957575281eadccbbbb8 Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:49:10 +0200 Subject: [PATCH 09/10] fix(cli): harden update-check version handling (CodeQL) Address four CodeQL findings in the new update-check lib: - Strictly validate every version token (VERSION_RE / sanitizeVersion) before it is cached to disk or interpolated into a registry/GitHub URL, so untrusted response/cache bytes can't reach the filesystem or an outbound request (fixes 'network data written to file' + 'file data in outbound request'). - Percent-encode every '/' in the package name (replaceAll), not just the first (fixes 'incomplete string escaping'). - Tighten the test's registry-URL match to startsWith (fixes 'incomplete URL substring sanitization'). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/lib/update-check.test.ts | 21 ++++++++++++++++++++- cli/src/lib/update-check.ts | 31 +++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/cli/src/lib/update-check.test.ts b/cli/src/lib/update-check.test.ts index cea271ea2..87a556a89 100644 --- a/cli/src/lib/update-check.test.ts +++ b/cli/src/lib/update-check.test.ts @@ -11,6 +11,7 @@ import { fetchLatestVersion, fetchChangelogSummary, checkForCliUpdate, + sanitizeVersion, CLI_PACKAGE, } from "./update-check.js"; @@ -72,6 +73,24 @@ describe("summarizeReleaseBody", () => { }); }); +describe("sanitizeVersion", () => { + it("accepts well-formed semver tokens", () => { + expect(sanitizeVersion("0.1.24")).toBe("0.1.24"); + expect(sanitizeVersion("10.20.30")).toBe("10.20.30"); + expect(sanitizeVersion("1.2.3-interim.4")).toBe("1.2.3-interim.4"); + }); + + it("rejects anything that isn't a clean version (no URLs, paths, junk)", () => { + expect(sanitizeVersion("../../etc/passwd")).toBeNull(); + expect(sanitizeVersion("1.2")).toBeNull(); + expect(sanitizeVersion("v1.2.3")).toBeNull(); // caller strips the leading v + expect(sanitizeVersion("1.2.3/evil")).toBeNull(); + expect(sanitizeVersion("https://evil.example/1.2.3")).toBeNull(); + expect(sanitizeVersion(42)).toBeNull(); + expect(sanitizeVersion(undefined)).toBeNull(); + }); +}); + describe("fetchLatestVersion", () => { const realFetch = globalThis.fetch; afterEach(() => { @@ -142,7 +161,7 @@ describe("checkForCliUpdate", () => { it("reports an update when the registry has a newer version", async () => { globalThis.fetch = vi.fn(async (url: string | URL) => { const u = String(url); - if (u.includes("registry.npmjs.org")) { + if (u.startsWith("https://registry.npmjs.org/")) { return new Response(JSON.stringify({ version: "999.0.0" }), { status: 200 }); } // changelog diff --git a/cli/src/lib/update-check.ts b/cli/src/lib/update-check.ts index 740d9f77a..3fe4d36e0 100644 --- a/cli/src/lib/update-check.ts +++ b/cli/src/lib/update-check.ts @@ -79,7 +79,9 @@ async function readCache(): Promise { const parsed = JSON.parse(raw) as Partial; if (typeof parsed.checkedAt !== "number") return null; return { - latest: typeof parsed.latest === "string" ? parsed.latest : null, + // Re-validate the cached version on read: the file is untrusted input, so + // only a well-formed version token is allowed back into comparisons / URLs. + latest: sanitizeVersion(parsed.latest), checkedAt: parsed.checkedAt, notifiedAt: typeof parsed.notifiedAt === "number" ? parsed.notifiedAt : 0, }; @@ -98,15 +100,27 @@ async function writeCache(cache: UpdateCache): Promise { } } +/** Strict semver-ish token: `MAJOR.MINOR.PATCH` with an optional pre-release of + * dot-separated alphanumerics/hyphens. This is the ONLY shape we ever accept + * from the network, cache to disk, or interpolate into a URL — it sanitises + * the registry/GitHub responses so untrusted bytes can't reach the filesystem + * or an outbound request. Exported for tests. */ +export const VERSION_RE = /^\d+\.\d+\.\d+(?:-[0-9A-Za-z][0-9A-Za-z.-]*)?$/; + +/** Return `v` iff it's a well-formed version token, else null. */ +export function sanitizeVersion(v: unknown): string | null { + return typeof v === "string" && VERSION_RE.test(v) ? v : null; +} + /** Fetch the `latest` dist-tag version from the npm registry. Best-effort: - * returns null on any error or timeout. Exported for tests. */ + * returns null on any error, timeout, or malformed version. Exported for tests. */ export async function fetchLatestVersion( pkg: string = CLI_PACKAGE, timeoutMs: number = FETCH_TIMEOUT_MS, ): Promise { // The `latest` endpoint returns just the dist-tag manifest — far smaller than - // the full packument. - const url = `https://registry.npmjs.org/${pkg.replace("/", "%2f")}/latest`; + // the full packument. Percent-encode every "/" in the (scoped) package name. + const url = `https://registry.npmjs.org/${pkg.replace(/\//g, "%2F")}/latest`; try { const resp = await fetch(url, { signal: AbortSignal.timeout(timeoutMs), @@ -114,7 +128,9 @@ export async function fetchLatestVersion( }); if (!resp.ok) return null; const body = (await resp.json()) as { version?: string }; - return typeof body.version === "string" ? body.version : null; + // Only ever return a strictly-validated version — nothing else is allowed + // to flow on to the cache file or a downstream URL. + return sanitizeVersion(body.version); } catch { return null; } @@ -126,7 +142,10 @@ export async function fetchChangelogSummary( version: string, timeoutMs: number = FETCH_TIMEOUT_MS, ): Promise { - const tag = version.startsWith("v") ? version : `v${version}`; + // Guard: refuse to interpolate anything but a validated version into the URL. + const safe = sanitizeVersion(version.replace(/^v/, "")); + if (!safe) return undefined; + const tag = `v${safe}`; const url = `https://api.github.com/repos/Azure/kars/releases/tags/${tag}`; try { const resp = await fetch(url, { From c189fbd8cfb1e2c5fbc8bc4667bd49d84ce4e80f Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Tue, 30 Jun 2026 15:49:33 +0200 Subject: [PATCH 10/10] =?UTF-8?q?chore(release):=20v0.1.24=20=E2=80=94=20M?= =?UTF-8?q?CP=20out-of-the-box=20(egress=20auto-derive=20+=20session=20kee?= =?UTF-8?q?palive)=20+=20kars=20update?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/package-lock.json | 4 ++-- cli/package.json | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/package-lock.json b/cli/package-lock.json index cf9e807f7..698fd19c1 100644 --- a/cli/package-lock.json +++ b/cli/package-lock.json @@ -1,12 +1,12 @@ { "name": "@kars-runtime/cli", - "version": "0.1.23", + "version": "0.1.24", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@kars-runtime/cli", - "version": "0.1.23", + "version": "0.1.24", "license": "MIT", "dependencies": { "@azure/identity": "^4.0.0", diff --git a/cli/package.json b/cli/package.json index bb4e48e28..1d1b8c323 100644 --- a/cli/package.json +++ b/cli/package.json @@ -1,6 +1,6 @@ { "name": "@kars-runtime/cli", - "version": "0.1.23", + "version": "0.1.24", "description": "Enterprise-grade runtime for running OpenClaw AI assistants safely on Azure", "license": "MIT", "repository": {