From 0abb3c3b27f2dee9b5c7b82dafc76aa6f79bb42f Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 14:56:51 +0200
Subject: [PATCH 01/10] fix(mcp): keep stateful upstream sessions alive
 (Playwright about:blank)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MCP forwarder spoke request/response only: it POSTed tools/call and
read the immediate response, but never opened the standalone GET /mcp SSE
stream nor answered server-initiated pings. Heartbeating MCP servers —
Playwright MCP runs one (HTTP transport, runHeartbeat=true) — send the
client a JSON-RPC ping every ~3s and call server.close() if no pong
arrives within PLAYWRIGHT_MCP_PING_TIMEOUT_MS (default 5000). So every
session was reaped ~5s after creation; the next tools/call got
404 "Session not found", the forwarder re-initialized, and the retry
landed on a brand-new blank browser context — the agent saw about:blank
mid-task (navigate/click state lost).

Fix: act as a well-formed MCP client. For each stateful session, spawn a
keepalive task that holds the standalone GET SSE stream open and replies
pong to server pings, keeping the session — and the agent's live page —
alive. The task is cancelled/replaced when the session is re-initialized.

Also:
- Tighten the session-loss classifier so it only triggers on genuine
  4xx hard signals (never on healthy 2xx tool output that merely mentions
  "session", e.g. browser_evaluate returning sessionStorage), since a
  false positive is destructive for stateful servers.
- Carry the triggering status+body on CallAttempt::SessionLost so the
  re-init log records exactly why a session was deemed lost.

Tests: keepalive_holds_get_stream_and_pongs_server_ping plus the existing
stateful/session-loss suite. 1048 router tests pass; clippy + fmt clean.
Validated live on AKS: both sandbox routers hold a persistent GET stream
to Playwright that survives well past the 5s reaper; zero session-loss.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 inference-router/src/mcp/forwarder.rs | 1086 ++++++++++++++++++++++++-
 1 file changed, 1041 insertions(+), 45 deletions(-)
diff --git a/inference-router/src/mcp/forwarder.rs b/inference-router/src/mcp/forwarder.rs
index e329c305b..be929a3a3 100644
--- a/inference-router/src/mcp/forwarder.rs
+++ b/inference-router/src/mcp/forwarder.rs
@@ -67,6 +67,7 @@ use serde_json::{Value, json};
 use std::collections::BTreeMap;
 use std::sync::Arc;
 use std::time::Duration;
+use tokio::sync::Mutex;
 
 use super::registry::{DiscoveredMcpServer, McpServerRegistry};
 use super::tools::{
@@ -92,6 +93,27 @@ struct ForwarderEntry {
     /// outbound `tools/list` and `tools/call` POST. Resolved at
     /// discovery time from the env var named by `meta.bearer_from_env`.
     bearer_token: Option<String>,
+    /// Live MCP session for a STATEFUL upstream — the `Mcp-Session-Id` plus the
+    /// negotiated protocol version, established by the `initialize` handshake at
+    /// discovery and reused across every `tools/call` so servers that bind state
+    /// to the session (e.g. Playwright, whose browser context lives in the
+    /// session) keep that state between calls. Re-established on demand when an
+    /// upstream reports the session is gone (pod restart / TTL expiry).
+    session: Arc<Mutex<McpSession>>,
+    /// Background keepalive task for a STATEFUL session (see
+    /// [`run_session_keepalive`]). MCP servers such as Playwright run a
+    /// server-side heartbeat: they send the client a JSON-RPC `ping` over the
+    /// standalone `GET /mcp` SSE stream and, if no response (`pong`) arrives
+    /// within a few seconds (Playwright: `PLAYWRIGHT_MCP_PING_TIMEOUT_MS`,
+    /// default 5000), they tear the session down. A forwarder that only issues
+    /// request/response `tools/call` POSTs never receives those pings, so every
+    /// session silently dies ~5s after creation — the next call then 404s with
+    /// "Session not found", forcing a re-init onto a brand-new (blank) browser
+    /// context. The keepalive task holds the GET stream open and answers pings,
+    /// keeping the session — and the agent's live browser page — alive. Holds
+    /// the task's [`AbortHandle`] so it can be cancelled when the session is
+    /// re-initialized (replaced) or the dispatcher is dropped.
+    keepalive: Arc<Mutex<Option<tokio::task::AbortHandle>>>,
 }
 
 /// Namespaced MCP forwarder — the production-mode `AsyncToolDispatcher`
@@ -297,7 +319,15 @@ async fn build_entry_for(
         );
     }
 
-    let upstream_tools = fetch_upstream_tools(http, &meta.url, bearer_token.as_deref()).await?;
+    // MCP streamable-HTTP servers may be STATEFUL: they require an
+    // `initialize` handshake and bind subsequent requests to the
+    // `Mcp-Session-Id` they issue. Establish that session up front
+    // (best-effort: stateless servers simply return no session id and the
+    // `tools/list` below proceeds unauthenticated as before).
+    let session = initialize_session(http, &meta.url, bearer_token.as_deref()).await;
+
+    let upstream_tools =
+        fetch_upstream_tools(http, &meta.url, bearer_token.as_deref(), &session).await?;
 
     let filtered = filter_by_allowlist(&upstream_tools, &meta.allowed_tools);
     if filtered.is_empty() {
@@ -320,6 +350,20 @@ async fn build_entry_for(
         });
     }
 
+    // For a STATEFUL session, start the heartbeat-responder so the upstream
+    // doesn't reap the session (and the agent's live browser page) after its
+    // ping timeout. Stateless sessions (id: None) need no keepalive.
+    let keepalive: Arc<Mutex<Option<tokio::task::AbortHandle>>> = Arc::new(Mutex::new(None));
+    if session.id.is_some() {
+        let handle = spawn_session_keepalive(
+            http.clone(),
+            meta.url.clone(),
+            session.clone(),
+            bearer_token.clone(),
+        );
+        *keepalive.lock().await = Some(handle);
+    }
+
     Ok((
         ForwarderEntry {
             name: name.to_string(),
@@ -327,12 +371,318 @@ async fn build_entry_for(
             upstream_url: meta.url.clone(),
             tools: tools_map,
             bearer_token,
+            session: Arc::new(Mutex::new(session)),
+            keepalive,
         },
         namespaced_defs,
     ))
 }
 
-/// POST a JSON-RPC `tools/list` to `url` and parse the result. No
+/// The MCP protocol version the router advertises in its `initialize`
+/// handshake. Servers negotiate to a version they support; this is the
+/// version the router proposes and falls back to when the upstream doesn't
+/// echo one back.
+const MCP_PROTOCOL_VERSION: &str = "2025-06-18";
+
+/// A negotiated MCP streamable-HTTP session.
+///
+/// Two pieces of per-connection state survive the `initialize` handshake and
+/// MUST be replayed on every subsequent request for broad server
+/// compatibility:
+///
+/// - **`id`** — the `Mcp-Session-Id` a STATEFUL server issues (Playwright, the
+///   official TS-SDK servers, GitHub MCP, …). `None` for stateless servers,
+///   which simply don't return the header.
+/// - **`protocol_version`** — the version the server negotiated. Per MCP
+///   2025-06-18 the client MUST send it back in the `MCP-Protocol-Version`
+///   header on every post-initialize HTTP request; SDK-based servers reject
+///   requests that omit it with `400 Bad Request`. We default to the version
+///   we proposed when the server doesn't echo one (older servers ignore the
+///   header).
+#[derive(Clone, Debug)]
+struct McpSession {
+    id: Option<String>,
+    protocol_version: String,
+}
+
+impl McpSession {
+    /// A session for an upstream that did not complete a handshake (stateless,
+    /// or handshake unreachable): no session id, default protocol version.
+    fn stateless() -> Self {
+        Self {
+            id: None,
+            protocol_version: MCP_PROTOCOL_VERSION.to_string(),
+        }
+    }
+
+    /// Apply this session's headers (`Mcp-Session-Id` when present, and always
+    /// `MCP-Protocol-Version`) to an outbound request builder.
+    fn apply(&self, mut req: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
+        req = req.header("mcp-protocol-version", &self.protocol_version);
+        if let Some(sid) = self.id.as_deref() {
+            req = req.header("mcp-session-id", sid);
+        }
+        req
+    }
+}
+
+/// MCP `initialize` handshake — negotiates the protocol version and (for
+/// STATEFUL upstreams) establishes the `Mcp-Session-Id` reused across every
+/// subsequent request.
+///
+/// Best-effort by design: stateless servers (and servers that don't implement
+/// the handshake) return no session header, in which case the returned session
+/// has `id: None` and the caller proceeds without one — preserving the original
+/// `tools/list` behaviour. Stateful servers (e.g. Playwright MCP, which returns
+/// `400 "Server not initialized"` on a bare `tools/list`) issue a session id;
+/// for those we also send the required `notifications/initialized` to complete
+/// the handshake before any `tools/list` / `tools/call`.
+///
+/// Never returns an error: a failed handshake degrades to a stateless session
+/// and the real signal surfaces from the subsequent `tools/list` (which keeps
+/// error reporting in one place).
+async fn initialize_session(http: &reqwest::Client, url: &str, bearer: Option<&str>) -> McpSession {
+    let init_body = json!({
+        "jsonrpc": "2.0",
+        "id": 1,
+        "method": "initialize",
+        "params": {
+            "protocolVersion": MCP_PROTOCOL_VERSION,
+            "capabilities": {},
+            "clientInfo": {
+                "name": "kars-inference-router",
+                "version": env!("CARGO_PKG_VERSION"),
+            },
+        },
+    });
+
+    let mut req = http
+        .post(url)
+        .header("content-type", "application/json")
+        .header("accept", "application/json, text/event-stream")
+        .json(&init_body);
+    if let Some(token) = bearer {
+        req = req.bearer_auth(token);
+    }
+
+    let resp = match req.send().await {
+        Ok(r) => r,
+        Err(e) => {
+            tracing::debug!(url = %url, error = %e, "MCP initialize POST failed; treating upstream as stateless");
+            return McpSession::stateless();
+        }
+    };
+
+    let session_id = resp
+        .headers()
+        .get("mcp-session-id")
+        .and_then(|v| v.to_str().ok())
+        .map(|s| s.to_string());
+    let content_type = resp
+        .headers()
+        .get("content-type")
+        .and_then(|v| v.to_str().ok())
+        .unwrap_or("")
+        .to_string();
+
+    // The initialize *result* carries the negotiated `protocolVersion`. The
+    // body may be plain JSON or an SSE event stream (official TS-SDK servers
+    // reply with SSE); `extract_jsonrpc_payload` handles both. If anything is
+    // missing we fall back to the version we proposed.
+    let protocol_version = match resp.text().await {
+        Ok(body) => extract_jsonrpc_payload(&content_type, &body)
+            .ok()
+            .and_then(|v| {
+                v.get("result")
+                    .and_then(|r| r.get("protocolVersion"))
+                    .and_then(|p| p.as_str())
+                    .map(|s| s.to_string())
+            })
+            .unwrap_or_else(|| MCP_PROTOCOL_VERSION.to_string()),
+        Err(_) => MCP_PROTOCOL_VERSION.to_string(),
+    };
+
+    let Some(sid) = session_id else {
+        // Stateless server: no session id, but still report the negotiated
+        // protocol version so we send the `MCP-Protocol-Version` header.
+        return McpSession {
+            id: None,
+            protocol_version,
+        };
+    };
+
+    // Stateful server: complete the handshake. Per the MCP spec the client
+    // MUST send `notifications/initialized` (carrying the session + protocol
+    // headers) before issuing requests. Best-effort: a transient hiccup here
+    // surfaces as a clear error on the following `tools/list`.
+    let session = McpSession {
+        id: Some(sid),
+        protocol_version,
+    };
+    let note = json!({ "jsonrpc": "2.0", "method": "notifications/initialized" });
+    let mut nreq = http
+        .post(url)
+        .header("content-type", "application/json")
+        .header("accept", "application/json, text/event-stream")
+        .json(&note);
+    nreq = session.apply(nreq);
+    if let Some(token) = bearer {
+        nreq = nreq.bearer_auth(token);
+    }
+    if let Err(e) = nreq.send().await {
+        tracing::debug!(url = %url, error = %e, "MCP notifications/initialized POST failed");
+    }
+
+    tracing::debug!(url = %url, protocol = %session.protocol_version, "Established MCP session with stateful upstream");
+    session
+}
+
+/// How long the keepalive holds a single `GET /mcp` SSE connection before
+/// proactively reconnecting. The connection normally stays open for the life
+/// of the session; this bound just guarantees we recycle a wedged TCP
+/// connection rather than block forever on a half-open socket.
+const KEEPALIVE_GET_TIMEOUT: Duration = Duration::from_secs(3600);
+
+/// Backoff before re-opening the standalone SSE stream after it ends. Short
+/// enough that a transient blip doesn't leave a ping unanswered past the
+/// upstream's timeout, long enough to avoid a hot loop if the server has
+/// actually torn the session down (in which case the next GET 404s and the
+/// task exits).
+const KEEPALIVE_RECONNECT_BACKOFF: Duration = Duration::from_millis(750);
+
+/// Spawn the [`run_session_keepalive`] task and return its [`AbortHandle`].
+fn spawn_session_keepalive(
+    http: reqwest::Client,
+    url: String,
+    session: McpSession,
+    bearer: Option<String>,
+) -> tokio::task::AbortHandle {
+    tokio::spawn(run_session_keepalive(http, url, session, bearer)).abort_handle()
+}
+
+/// Keep a STATEFUL MCP session alive by behaving as a well-formed MCP client:
+/// hold the standalone `GET /mcp` SSE stream open and answer the server's
+/// JSON-RPC `ping` requests with a `pong`.
+///
+/// # Why this is required
+///
+/// MCP servers may run a server-initiated heartbeat to detect dead clients.
+/// Playwright MCP does (HTTP transport, `runHeartbeat = true`): every ~3s it
+/// calls `server.ping()` and, if no response arrives within
+/// `PLAYWRIGHT_MCP_PING_TIMEOUT_MS` (default 5000), it calls `server.close()`,
+/// which deletes the session. A forwarder that only POSTs `tools/call` and
+/// reads the immediate response never sees those pings, so the session — and
+/// the browser context bound to it — is reaped ~5s after creation. The next
+/// `tools/call` then gets `404 "Session not found"`, the forwarder re-inits,
+/// and the retry lands on a fresh blank page (`about:blank`). Holding the GET
+/// stream and ponging keeps the session (and the agent's page) alive.
+///
+/// # Behaviour
+///
+/// Loops for the life of the task (cancelled via its [`AbortHandle`] when the
+/// session is re-initialized or the dispatcher is dropped):
+///
+/// 1. Open `GET {url}` with `Accept: text/event-stream` + the session headers.
+/// 2. A non-2xx (e.g. `404 Session not found`, or `405` from a server that
+///    doesn't implement the standalone stream) means there's nothing to keep
+///    alive this way — exit; the normal `tools/call` path handles re-init.
+/// 3. Stream the SSE body. For each server→client JSON-RPC `ping`, POST a
+///    `{"result":{}}` response carrying the same id and the session headers.
+/// 4. If the stream ends while the session may still be valid, back off briefly
+///    and reconnect.
+async fn run_session_keepalive(
+    http: reqwest::Client,
+    url: String,
+    session: McpSession,
+    bearer: Option<String>,
+) {
+    use futures::StreamExt;
+
+    loop {
+        let mut req = http
+            .get(&url)
+            .header("accept", "text/event-stream")
+            .header("mcp-protocol-version", &session.protocol_version)
+            .timeout(KEEPALIVE_GET_TIMEOUT);
+        if let Some(sid) = session.id.as_deref() {
+            req = req.header("mcp-session-id", sid);
+        }
+        if let Some(token) = bearer.as_deref() {
+            req = req.bearer_auth(token);
+        }
+
+        let resp = match req.send().await {
+            Ok(r) => r,
+            Err(e) => {
+                tracing::debug!(url = %url, error = %e, "MCP keepalive GET failed; retrying");
+                tokio::time::sleep(KEEPALIVE_RECONNECT_BACKOFF).await;
+                continue;
+            }
+        };
+
+        if !resp.status().is_success() {
+            // 404 → session already gone; 405 → server has no standalone SSE
+            // stream (so it can't be heartbeating us either). Either way, stop:
+            // there is nothing this task can keep alive.
+            tracing::debug!(
+                url = %url,
+                status = %resp.status(),
+                "MCP keepalive GET not available; stopping keepalive for this session"
+            );
+            return;
+        }
+
+        let mut stream = resp.bytes_stream();
+        let mut buf = String::new();
+        while let Some(chunk) = stream.next().await {
+            let bytes = match chunk {
+                Ok(b) => b,
+                Err(e) => {
+                    tracing::debug!(url = %url, error = %e, "MCP keepalive SSE read error; reconnecting");
+                    break;
+                }
+            };
+            buf.push_str(&String::from_utf8_lossy(&bytes));
+            // SSE frames are newline-delimited; pull complete lines and act on
+            // `data:` lines that carry a server→client `ping` request.
+            while let Some(nl) = buf.find('\n') {
+                let line = buf[..nl].trim().to_string();
+                buf.drain(..=nl);
+                let Some(payload) = line.strip_prefix("data:") else {
+                    continue;
+                };
+                let Ok(msg) = serde_json::from_str::<Value>(payload.trim()) else {
+                    continue;
+                };
+                if msg.get("method").and_then(|m| m.as_str()) != Some("ping") {
+                    continue;
+                }
+                let Some(id) = msg.get("id").cloned() else {
+                    // A ping *notification* (no id) needs no response.
+                    continue;
+                };
+                let pong = json!({ "jsonrpc": "2.0", "id": id, "result": {} });
+                let mut preq = http
+                    .post(&url)
+                    .header("content-type", "application/json")
+                    .header("accept", "application/json, text/event-stream")
+                    .json(&pong);
+                preq = session.apply(preq);
+                if let Some(token) = bearer.as_deref() {
+                    preq = preq.bearer_auth(token);
+                }
+                if let Err(e) = preq.send().await {
+                    tracing::debug!(url = %url, error = %e, "MCP keepalive pong POST failed");
+                }
+            }
+        }
+
+        // Stream ended. Back off, then reconnect — if the session was reaped the
+        // next GET 404s and we exit; otherwise we resume answering pings.
+        tokio::time::sleep(KEEPALIVE_RECONNECT_BACKOFF).await;
+    }
+}
+
 /// pagination — Slice 4d.4 caps at one page; multi-page upstreams are
 /// truncated with a recorded warning. Multi-page support lands when
 /// we have a real consumer that hits the cap (principles §5).
@@ -340,6 +690,7 @@ async fn fetch_upstream_tools(
     http: &reqwest::Client,
     url: &str,
     bearer: Option<&str>,
+    session: &McpSession,
 ) -> Result<Vec<ToolDefinition>, String> {
     let body = json!({
         "jsonrpc": "2.0",
@@ -355,6 +706,7 @@ async fn fetch_upstream_tools(
     if let Some(token) = bearer {
         req = req.bearer_auth(token);
     }
+    req = session.apply(req);
     let resp = req
         .send()
         .await
@@ -421,12 +773,82 @@ fn filter_by_allowlist(upstream: &[ToolDefinition], allow: &[String]) -> Vec<Too
 }
 
 /// Forward one `tools/call` invocation to `entry.upstream_url`.
-async fn forward_tools_call(
+/// Outcome of a single `tools/call` POST attempt, distinguishing a stale
+/// session (which warrants one re-handshake + retry) from a terminal result.
+enum CallAttempt {
+    /// Upstream produced a usable response (success or a per-call isError).
+    Done(ToolCallOutput),
+    /// Upstream reported its session is gone (pod restart / TTL expiry).
+    /// The caller re-initializes the session and retries once. Carries the
+    /// triggering signal (HTTP status + trimmed body, or the JSON-RPC error)
+    /// so the re-init log line records *why* the session was deemed lost —
+    /// turning any future false-positive into a one-line diagnosis instead of
+    /// a guessing game.
+    SessionLost { reason: String },
+    /// Terminal failure — surface to the agent as-is.
+    Fatal(DispatchError),
+}
+
+/// Heuristic: does this upstream response indicate the MCP session is no
+/// longer valid (so a single re-handshake + retry is warranted)?
+///
+/// This MUST be conservative. A false positive is *destructive* for
+/// stateful servers: re-initializing throws away live per-session state
+/// (e.g. a Playwright server's open browser page) and the retry lands on
+/// a brand-new blank context. We therefore only treat a response as
+/// session-loss on the canonical, unambiguous signals:
+///
+///   - the MCP SDK's exact `Server not initialized` 400 (what Playwright
+///     and the official TypeScript SDK return once a session is gone), or
+///   - a body that explicitly couples "session" with an expiry/not-found
+///     qualifier (`expired`, `not found`, `invalid`, `unknown`, …).
+///
+/// Crucially we do NOT match the bare word "session": large tool
+/// responses (browser snapshots, `evaluate` output) routinely embed it
+/// (`sessionStorage`, page text), and matching it corrupted healthy
+/// sessions mid-flight.
+fn is_session_lost(status: reqwest::StatusCode, body: &str) -> bool {
+    let code = status.as_u16();
+    if code != 400 && code != 404 {
+        return false;
+    }
+    body_signals_session_loss(body)
+}
+
+/// Shared session-loss body classifier for both the HTTP-status path and
+/// the JSON-RPC-error path. Conservative by design (see [`is_session_lost`]).
+fn body_signals_session_loss(body: &str) -> bool {
+    let b = body.to_ascii_lowercase();
+    // Canonical MCP SDK signal — emitted verbatim by Playwright MCP and the
+    // official reference servers when the session they issued is gone.
+    if b.contains("not initialized") {
+        return true;
+    }
+    // Other servers phrase it as an expired/invalid/missing *session*. Require
+    // the word "session" AND an explicit invalidation qualifier so incidental
+    // occurrences of "session" in tool output never trigger a re-init.
+    if b.contains("session") {
+        const QUALIFIERS: [&str; 6] = [
+            "expired",
+            "not found",
+            "invalid",
+            "unknown",
+            "missing",
+            "no longer",
+        ];
+        return QUALIFIERS.iter().any(|q| b.contains(q));
+    }
+    false
+}
+
+async fn post_tools_call(
     http: &reqwest::Client,
     entry: &ForwarderEntry,
     upstream_name: &str,
     arguments: &Value,
-) -> Result<ToolCallOutput, DispatchError> {
+    session: &McpSession,
+) -> CallAttempt {
+    let tool_label = format!("{}.{}", entry.prefix, upstream_name);
     let body = json!({
         "jsonrpc": "2.0",
         "id": 1,
@@ -445,13 +867,17 @@ async fn forward_tools_call(
     if let Some(token) = entry.bearer_token.as_deref() {
         req = req.bearer_auth(token);
     }
-    let resp = req
-        .send()
-        .await
-        .map_err(|e| DispatchError::ExecutionFailed {
-            tool: format!("{}.{}", entry.prefix, upstream_name),
-            reason: format!("upstream POST failed: {e}"),
-        })?;
+    req = session.apply(req);
+
+    let resp = match req.send().await {
+        Ok(r) => r,
+        Err(e) => {
+            return CallAttempt::Fatal(DispatchError::ExecutionFailed {
+                tool: tool_label,
+                reason: format!("upstream POST failed: {e}"),
+            });
+        }
+    };
 
     let status = resp.status();
     let content_type = resp
@@ -460,17 +886,28 @@ async fn forward_tools_call(
         .and_then(|v| v.to_str().ok())
         .unwrap_or("")
         .to_string();
-    let body_text = resp
-        .text()
-        .await
-        .map_err(|e| DispatchError::ExecutionFailed {
-            tool: format!("{}.{}", entry.prefix, upstream_name),
-            reason: format!("upstream body read failed: {e}"),
-        })?;
+    let body_text = match resp.text().await {
+        Ok(t) => t,
+        Err(e) => {
+            return CallAttempt::Fatal(DispatchError::ExecutionFailed {
+                tool: tool_label,
+                reason: format!("upstream body read failed: {e}"),
+            });
+        }
+    };
 
     if !status.is_success() {
-        return Err(DispatchError::ExecutionFailed {
-            tool: format!("{}.{}", entry.prefix, upstream_name),
+        if is_session_lost(status, &body_text) {
+            return CallAttempt::SessionLost {
+                reason: format!(
+                    "http {} body: {}",
+                    status,
+                    body_text.chars().take(160).collect::<String>()
+                ),
+            };
+        }
+        return CallAttempt::Fatal(DispatchError::ExecutionFailed {
+            tool: tool_label,
             reason: format!(
                 "upstream non-2xx: {} (body trimmed: {})",
                 status,
@@ -479,26 +916,43 @@ async fn forward_tools_call(
         });
     }
 
-    let json_payload = extract_jsonrpc_payload(&content_type, &body_text).map_err(|e| {
-        DispatchError::ExecutionFailed {
-            tool: format!("{}.{}", entry.prefix, upstream_name),
-            reason: format!("upstream response decode failed: {e}"),
+    let json_payload = match extract_jsonrpc_payload(&content_type, &body_text) {
+        Ok(v) => v,
+        Err(e) => {
+            return CallAttempt::Fatal(DispatchError::ExecutionFailed {
+                tool: tool_label,
+                reason: format!("upstream response decode failed: {e}"),
+            });
+        }
+    };
+    let parsed: ToolsCallResponse = match serde_json::from_value(json_payload) {
+        Ok(p) => p,
+        Err(e) => {
+            return CallAttempt::Fatal(DispatchError::ExecutionFailed {
+                tool: tool_label,
+                reason: format!("upstream tools/call parse failed: {e}"),
+            });
         }
-    })?;
-    let parsed: ToolsCallResponse =
-        serde_json::from_value(json_payload).map_err(|e| DispatchError::ExecutionFailed {
-            tool: format!("{}.{}", entry.prefix, upstream_name),
-            reason: format!("upstream tools/call parse failed: {e}"),
-        })?;
+    };
 
     if let Some(err) = parsed.error {
-        // Upstream protocol error → surface as an isError content
-        // entry, not a DispatchError. Per MCP spec, JSON-RPC errors
-        // from `tools/call` indicate the *protocol* failed; the
-        // semantic "tool ran but errored" path uses isError:true.
-        // We collapse them here because the agent-facing surface
-        // shouldn't distinguish (the audit layer can see both).
-        return Ok(ToolCallOutput {
+        // A JSON-RPC session error (code -32000/-32001 with an explicit
+        // session-loss message) also means the session is gone — retry once.
+        // We reuse the same conservative body classifier so an ordinary
+        // tool-level error whose message merely mentions "session" is NOT
+        // mistaken for a lost session.
+        if (err.code == -32000 || err.code == -32001) && body_signals_session_loss(&err.message) {
+            return CallAttempt::SessionLost {
+                reason: format!("jsonrpc error code={} message={}", err.code, err.message),
+            };
+        }
+        // Other upstream protocol error → surface as an isError content
+        // entry, not a DispatchError. Per MCP spec, JSON-RPC errors from
+        // `tools/call` indicate the *protocol* failed; the semantic "tool
+        // ran but errored" path uses isError:true. We collapse them here
+        // because the agent-facing surface shouldn't distinguish (the audit
+        // layer can see both).
+        return CallAttempt::Done(ToolCallOutput {
             content: vec![ToolContent::Text {
                 text: format!(
                     "upstream JSON-RPC error code={} message={}",
@@ -509,19 +963,78 @@ async fn forward_tools_call(
         });
     }
 
-    let result = parsed
-        .result
-        .ok_or_else(|| DispatchError::ExecutionFailed {
-            tool: format!("{}.{}", entry.prefix, upstream_name),
-            reason: "tools/call missing result".to_string(),
-        })?;
+    let result = match parsed.result {
+        Some(r) => r,
+        None => {
+            return CallAttempt::Fatal(DispatchError::ExecutionFailed {
+                tool: tool_label,
+                reason: "tools/call missing result".to_string(),
+            });
+        }
+    };
 
-    Ok(ToolCallOutput {
+    CallAttempt::Done(ToolCallOutput {
         content: result.content,
         is_error: result.is_error.unwrap_or(false),
     })
 }
 
+async fn forward_tools_call(
+    http: &reqwest::Client,
+    entry: &ForwarderEntry,
+    upstream_name: &str,
+    arguments: &Value,
+) -> Result<ToolCallOutput, DispatchError> {
+    // Reuse the session established at discovery so stateful upstreams keep
+    // their per-session state (e.g. the open browser page) across calls.
+    let session = entry.session.lock().await.clone();
+
+    match post_tools_call(http, entry, upstream_name, arguments, &session).await {
+        CallAttempt::Done(out) => Ok(out),
+        CallAttempt::Fatal(e) => Err(e),
+        CallAttempt::SessionLost { reason } => {
+            // Re-establish the session once and retry. Covers upstream pod
+            // restarts and session TTL expiry without failing the agent's call.
+            tracing::info!(
+                tool = %format!("{}.{}", entry.prefix, upstream_name),
+                reason = %reason,
+                "MCP upstream session lost; re-initializing and retrying once"
+            );
+            let new_session =
+                initialize_session(http, &entry.upstream_url, entry.bearer_token.as_deref()).await;
+            *entry.session.lock().await = new_session.clone();
+
+            // The old keepalive (if any) was bound to the now-dead session;
+            // cancel it and start a fresh one for the re-established session so
+            // the new session is likewise protected from the upstream's
+            // heartbeat reaper.
+            {
+                let mut guard = entry.keepalive.lock().await;
+                if let Some(old) = guard.take() {
+                    old.abort();
+                }
+                if new_session.id.is_some() {
+                    *guard = Some(spawn_session_keepalive(
+                        http.clone(),
+                        entry.upstream_url.clone(),
+                        new_session.clone(),
+                        entry.bearer_token.clone(),
+                    ));
+                }
+            }
+
+            match post_tools_call(http, entry, upstream_name, arguments, &new_session).await {
+                CallAttempt::Done(out) => Ok(out),
+                CallAttempt::Fatal(e) => Err(e),
+                CallAttempt::SessionLost { .. } => Err(DispatchError::ExecutionFailed {
+                    tool: format!("{}.{}", entry.prefix, upstream_name),
+                    reason: "upstream session could not be re-established after retry".to_string(),
+                }),
+            }
+        }
+    }
+}
+
 /// Decode an MCP Streamable HTTP response body into a JSON-RPC payload.
 ///
 /// Per the MCP Streamable HTTP transport spec, a server MAY respond
@@ -633,7 +1146,7 @@ mod tests {
         Json, Router,
         extract::State,
         http::{HeaderMap, StatusCode},
-        routing::{any, post},
+        routing::{any, get, post},
     };
     use std::collections::BTreeMap;
     use std::sync::Arc as StdArc;
@@ -1151,4 +1664,487 @@ mod tests {
         let ToolContent::Text { text: t2 } = &out2.content[0];
         assert!(t2.contains("called query"));
     }
+
+    /// A configurable STATEFUL mock upstream covering the transport variants of
+    /// the popular MCP servers:
+    ///
+    /// - **Playwright MCP** — JSON `initialize` body, `Mcp-Session-Id`, rejects
+    ///   bare `tools/list` with `400 "Server not initialized"`.
+    /// - **Official TS-SDK servers / GitHub MCP** — SSE `initialize` body, a
+    ///   negotiated `protocolVersion`, and a hard requirement that every
+    ///   post-initialize request carries the `MCP-Protocol-Version` header
+    ///   (returns `400` otherwise).
+    /// - **Session expiry / pod restart** — one request fails with a session
+    ///   error, forcing a re-initialize + retry.
+    #[derive(Clone)]
+    struct StatefulState {
+        tools: Vec<ToolDefinition>,
+        session_id: String,
+        negotiated_version: String,
+        /// Return the `initialize` result as a `text/event-stream` body.
+        sse_init: bool,
+        /// Reject `tools/*` requests that omit the `MCP-Protocol-Version` header.
+        require_protocol_header: bool,
+        init_count: StdArc<AtomicUsize>,
+        call_count: StdArc<AtomicUsize>,
+        /// When set, the next `tools/call` returns a `400` session error once
+        /// (then clears the flag), simulating session expiry / pod restart.
+        fail_next_with_session_lost: StdArc<std::sync::atomic::AtomicBool>,
+        /// When set, every successful `tools/call` result embeds the word
+        /// "session" in its text (mimics Playwright `browser_evaluate` output
+        /// that references `sessionStorage`). A healthy 200 like this must
+        /// NEVER be misread as a lost session.
+        result_mentions_session: bool,
+    }
+
+    impl StatefulState {
+        fn new(session_id: &str, tools: Vec<ToolDefinition>) -> Self {
+            Self {
+                tools,
+                session_id: session_id.to_string(),
+                negotiated_version: MCP_PROTOCOL_VERSION.to_string(),
+                sse_init: false,
+                require_protocol_header: false,
+                init_count: StdArc::new(AtomicUsize::new(0)),
+                call_count: StdArc::new(AtomicUsize::new(0)),
+                fail_next_with_session_lost: StdArc::new(std::sync::atomic::AtomicBool::new(false)),
+                result_mentions_session: false,
+            }
+        }
+    }
+
+    async fn stateful_mock_handler(
+        State(state): State<StatefulState>,
+        headers: HeaderMap,
+        Json(body): Json<serde_json::Value>,
+    ) -> axum::response::Response {
+        use axum::response::IntoResponse;
+        let method = body.get("method").and_then(|v| v.as_str()).unwrap_or("");
+        let id = body.get("id").cloned().unwrap_or(serde_json::json!(1));
+        let sid = headers
+            .get("mcp-session-id")
+            .and_then(|v| v.to_str().ok())
+            .unwrap_or("");
+        let has_proto = headers.contains_key("mcp-protocol-version");
+
+        let session_lost = || {
+            (
+                StatusCode::BAD_REQUEST,
+                Json(serde_json::json!({
+                    "jsonrpc": "2.0", "id": null,
+                    "error": {"code": -32000, "message": "Bad Request: Server not initialized"}
+                })),
+            )
+                .into_response()
+        };
+
+        match method {
+            "initialize" => {
+                state.init_count.fetch_add(1, Ordering::SeqCst);
+                let result = serde_json::json!({
+                    "jsonrpc": "2.0", "id": id,
+                    "result": {"protocolVersion": state.negotiated_version, "capabilities": {}}
+                });
+                if state.sse_init {
+                    let sse = format!("event: message\ndata: {result}\n\n");
+                    (
+                        StatusCode::OK,
+                        [
+                            ("mcp-session-id", state.session_id.clone()),
+                            ("content-type", "text/event-stream".to_string()),
+                        ],
+                        sse,
+                    )
+                        .into_response()
+                } else {
+                    (
+                        StatusCode::OK,
+                        [("mcp-session-id", state.session_id.clone())],
+                        Json(result),
+                    )
+                        .into_response()
+                }
+            }
+            "notifications/initialized" => StatusCode::ACCEPTED.into_response(),
+            "tools/list" | "tools/call" if sid != state.session_id => session_lost(),
+            "tools/list" | "tools/call" if state.require_protocol_header && !has_proto => (
+                StatusCode::BAD_REQUEST,
+                Json(serde_json::json!({
+                    "jsonrpc": "2.0", "id": null,
+                    "error": {"code": -32600, "message": "Missing MCP-Protocol-Version header"}
+                })),
+            )
+                .into_response(),
+            "tools/list" => (
+                StatusCode::OK,
+                Json(serde_json::json!({
+                    "jsonrpc": "2.0", "id": id, "result": {"tools": state.tools}
+                })),
+            )
+                .into_response(),
+            "tools/call" => {
+                state.call_count.fetch_add(1, Ordering::SeqCst);
+                if state
+                    .fail_next_with_session_lost
+                    .swap(false, Ordering::SeqCst)
+                {
+                    return session_lost();
+                }
+                let tool = body
+                    .pointer("/params/name")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("");
+                let text = if state.result_mentions_session {
+                    format!("called {tool}; page has sessionStorage keys: [token]")
+                } else {
+                    format!("called {tool}")
+                };
+                (
+                    StatusCode::OK,
+                    Json(serde_json::json!({
+                        "jsonrpc": "2.0", "id": id,
+                        "result": {"content": [{"type": "text", "text": text}], "isError": false}
+                    })),
+                )
+                    .into_response()
+            }
+            _ => (
+                StatusCode::OK,
+                Json(serde_json::json!({
+                    "jsonrpc": "2.0", "id": id,
+                    "error": {"code": -32601, "message": "method not found"}
+                })),
+            )
+                .into_response(),
+        }
+    }
+
+    async fn stateful_mock_upstream(state: StatefulState) -> String {
+        let app = Router::new()
+            .route("/", post(stateful_mock_handler))
+            .route("/", any(method_block))
+            .with_state(state);
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let addr = listener.local_addr().unwrap();
+        tokio::spawn(async move {
+            axum::serve(listener, app).await.unwrap();
+        });
+        format!("http://{addr}/")
+    }
+
+    /// Playwright-MCP shape: JSON `initialize`, session id, bare `tools/list`
+    /// rejected. Discovery + call must succeed and reuse one session.
+    #[tokio::test]
+    async fn stateful_playwright_shape_discovers_and_calls_with_reused_session() {
+        let state = StatefulState::new(
+            "sess-abc-123",
+            vec![tool_def("browser_navigate", "open a url")],
+        );
+        let init_count = state.init_count.clone();
+        let url = stateful_mock_upstream(state).await;
+        let registry = registry_with(vec![discovered("playwright", &url, vec!["*"])]);
+
+        let dispatcher = RouterToolDispatcher::discover(registry, Duration::from_secs(5))
+            .await
+            .expect("discover");
+        assert_eq!(
+            dispatcher.len(),
+            1,
+            "stateful server tools must be discovered"
+        );
+        assert!(
+            dispatcher.skipped().is_empty(),
+            "stateful server must not be skipped"
+        );
+
+        let out = dispatcher
+            .invoke(
+                "playwright.browser_navigate",
+                &serde_json::json!({"url": "https://x"}),
+            )
+            .await
+            .expect("invoke");
+        let ToolContent::Text { text } = &out.content[0];
+        assert!(text.contains("called browser_navigate"));
+        assert!(!out.is_error);
+
+        // Exactly one initialize: the session is reused, not re-created per call.
+        assert_eq!(
+            init_count.load(Ordering::SeqCst),
+            1,
+            "session must be reused across calls"
+        );
+    }
+
+    /// Official TS-SDK / GitHub-MCP shape: SSE `initialize` body, a negotiated
+    /// protocol version, and a hard requirement that the `MCP-Protocol-Version`
+    /// header is replayed on every request. Discovery succeeding proves the
+    /// router parses the SSE-negotiated version and sends the header.
+    #[tokio::test]
+    async fn stateful_sse_init_with_required_protocol_header() {
+        let mut state = StatefulState::new("gh-session-xyz", vec![tool_def("search_repos", "")]);
+        state.sse_init = true;
+        state.require_protocol_header = true;
+        state.negotiated_version = "2025-03-26".to_string();
+        let url = stateful_mock_upstream(state).await;
+        let registry = registry_with(vec![discovered("github-mcp", &url, vec!["*"])]);
+
+        let dispatcher = RouterToolDispatcher::discover(registry, Duration::from_secs(5))
+            .await
+            .expect("discover");
+        assert_eq!(
+            dispatcher.len(),
+            1,
+            "SSE-init server with required protocol header must be discovered"
+        );
+        assert!(dispatcher.skipped().is_empty());
+
+        let out = dispatcher
+            .invoke("github_mcp.search_repos", &serde_json::json!({"q": "kars"}))
+            .await
+            .expect("invoke");
+        let ToolContent::Text { text } = &out.content[0];
+        assert!(text.contains("called search_repos"));
+    }
+
+    /// Session expiry / upstream pod restart: the first `tools/call` fails with
+    /// a session error; the router must re-initialize and retry transparently.
+    #[tokio::test]
+    async fn stateful_session_expiry_triggers_reinit_and_retry() {
+        let state = StatefulState::new("sess-1", vec![tool_def("do_thing", "")]);
+        let init_count = state.init_count.clone();
+        let fail_flag = state.fail_next_with_session_lost.clone();
+        let url = stateful_mock_upstream(state).await;
+        let registry = registry_with(vec![discovered("svc", &url, vec!["*"])]);
+
+        let dispatcher = RouterToolDispatcher::discover(registry, Duration::from_secs(5))
+            .await
+            .expect("discover");
+        assert_eq!(
+            init_count.load(Ordering::SeqCst),
+            1,
+            "one initialize at discovery"
+        );
+
+        // Arm the upstream to drop the session on the next call.
+        fail_flag.store(true, Ordering::SeqCst);
+        let out = dispatcher
+            .invoke("svc.do_thing", &serde_json::json!({}))
+            .await
+            .expect("invoke should succeed after transparent re-init + retry");
+        let ToolContent::Text { text } = &out.content[0];
+        assert!(text.contains("called do_thing"));
+
+        // The retry path re-established the session exactly once more.
+        assert_eq!(
+            init_count.load(Ordering::SeqCst),
+            2,
+            "session must be re-initialized once on expiry"
+        );
+    }
+
+    /// Regression: a HEALTHY `tools/call` whose 200 result merely mentions the
+    /// word "session" (e.g. Playwright `browser_evaluate` returning page text
+    /// that references `sessionStorage`) must NOT be misread as a lost session.
+    /// Re-initializing here is destructive — it drops the live browser context
+    /// and the retry lands on a blank `about:blank` page. The session must be
+    /// reused with ZERO re-initializations across many such calls.
+    #[tokio::test]
+    async fn healthy_call_mentioning_session_does_not_reinitialize() {
+        let mut state =
+            StatefulState::new("sess-keep", vec![tool_def("browser_evaluate", "run js")]);
+        state.result_mentions_session = true;
+        let init_count = state.init_count.clone();
+        let url = stateful_mock_upstream(state).await;
+        let registry = registry_with(vec![discovered("pw", &url, vec!["*"])]);
+
+        let dispatcher = RouterToolDispatcher::discover(registry, Duration::from_secs(5))
+            .await
+            .expect("discover");
+        assert_eq!(
+            init_count.load(Ordering::SeqCst),
+            1,
+            "one init at discovery"
+        );
+
+        // Several calls in a row, each returning "session" in the result text.
+        for _ in 0..5 {
+            let out = dispatcher
+                .invoke(
+                    "pw.browser_evaluate",
+                    &serde_json::json!({"function": "() => 1"}),
+                )
+                .await
+                .expect("invoke");
+            let ToolContent::Text { text } = &out.content[0];
+            assert!(text.contains("sessionStorage"), "result text preserved");
+            assert!(!out.is_error);
+        }
+
+        // The single discovery session must have been reused for every call —
+        // no false-positive session-loss re-init.
+        assert_eq!(
+            init_count.load(Ordering::SeqCst),
+            1,
+            "healthy 'session'-mentioning results must not trigger re-initialization"
+        );
+    }
+
+    #[test]
+    fn session_loss_classifier_is_conservative() {
+        use reqwest::StatusCode;
+
+        // Canonical MCP SDK signal — Playwright / official servers.
+        assert!(is_session_lost(
+            StatusCode::BAD_REQUEST,
+            "Bad Request: Server not initialized"
+        ));
+        // Explicit session invalidation phrasings.
+        assert!(is_session_lost(StatusCode::NOT_FOUND, "session not found"));
+        assert!(is_session_lost(
+            StatusCode::BAD_REQUEST,
+            "Mcp-Session-Id expired"
+        ));
+        assert!(is_session_lost(
+            StatusCode::BAD_REQUEST,
+            "this session is no longer valid"
+        ));
+
+        // Incidental "session" mentions in error bodies must NOT match — this
+        // is exactly the false positive that corrupted live browser sessions.
+        assert!(!is_session_lost(
+            StatusCode::BAD_REQUEST,
+            "Error: page.evaluate failed; sessionStorage is empty"
+        ));
+        assert!(!is_session_lost(
+            StatusCode::BAD_REQUEST,
+            "invalid arguments: missing 'url'"
+        ));
+        // Non-4xx never qualifies regardless of body.
+        assert!(!is_session_lost(StatusCode::OK, "Server not initialized"));
+        assert!(!is_session_lost(
+            StatusCode::INTERNAL_SERVER_ERROR,
+            "session expired"
+        ));
+    }
+
+    /// Mock upstream that mimics a heartbeating MCP server (Playwright shape):
+    /// it issues a session on `initialize`, serves a `tools/list`, and — on the
+    /// standalone `GET /` SSE stream — pushes a server→client `ping` then holds
+    /// the stream open. A correct client MUST answer that ping with a `pong`
+    /// POST or the real server would reap the session after its ping timeout.
+    #[derive(Clone)]
+    struct HeartbeatState {
+        session_id: String,
+        tools: Vec<ToolDefinition>,
+        get_hits: StdArc<AtomicUsize>,
+        pong_hits: StdArc<AtomicUsize>,
+    }
+
+    async fn heartbeat_post(
+        State(state): State<HeartbeatState>,
+        Json(body): Json<serde_json::Value>,
+    ) -> axum::response::Response {
+        use axum::response::IntoResponse;
+        let method = body.get("method").and_then(|v| v.as_str()).unwrap_or("");
+        let id = body.get("id").cloned().unwrap_or(serde_json::json!(1));
+        match method {
+            "initialize" => (
+                StatusCode::OK,
+                [("mcp-session-id", state.session_id.clone())],
+                Json(serde_json::json!({
+                    "jsonrpc": "2.0", "id": id,
+                    "result": {"protocolVersion": MCP_PROTOCOL_VERSION, "capabilities": {}}
+                })),
+            )
+                .into_response(),
+            "notifications/initialized" => StatusCode::ACCEPTED.into_response(),
+            "tools/list" => (
+                StatusCode::OK,
+                Json(serde_json::json!({
+                    "jsonrpc": "2.0", "id": id, "result": {"tools": state.tools}
+                })),
+            )
+                .into_response(),
+            // A `pong` is a JSON-RPC *response*: it has `result` + `id` and no
+            // `method`. That's the heartbeat reply we're asserting on.
+            "" if body.get("result").is_some() && body.get("id").is_some() => {
+                state.pong_hits.fetch_add(1, Ordering::SeqCst);
+                StatusCode::ACCEPTED.into_response()
+            }
+            _ => StatusCode::ACCEPTED.into_response(),
+        }
+    }
+
+    async fn heartbeat_get(State(state): State<HeartbeatState>) -> axum::response::Response {
+        use axum::response::IntoResponse;
+        use futures::stream::{self, StreamExt};
+        state.get_hits.fetch_add(1, Ordering::SeqCst);
+        // Push one server→client ping, then hold the stream open forever.
+        let ping = "event: message\ndata: {\"jsonrpc\":\"2.0\",\"id\":0,\"method\":\"ping\"}\n\n";
+        let body = axum::body::Body::from_stream(
+            stream::once(async move {
+                Ok::<_, std::io::Error>(axum::body::Bytes::from_static(ping.as_bytes()))
+            })
+            .chain(stream::pending()),
+        );
+        (
+            StatusCode::OK,
+            [("content-type", "text/event-stream")],
+            body,
+        )
+            .into_response()
+    }
+
+    /// A heartbeating upstream must not reap our session: the forwarder has to
+    /// hold the standalone `GET` SSE stream open and answer server pings with a
+    /// `pong`. This is the fix for the Playwright `about:blank` regression
+    /// (session reaped after the 5s ping timeout → re-init onto a blank page).
+    #[tokio::test]
+    async fn keepalive_holds_get_stream_and_pongs_server_ping() {
+        let state = HeartbeatState {
+            session_id: "hb-session-1".to_string(),
+            tools: vec![tool_def("browser_navigate", "go")],
+            get_hits: StdArc::new(AtomicUsize::new(0)),
+            pong_hits: StdArc::new(AtomicUsize::new(0)),
+        };
+        let get_hits = state.get_hits.clone();
+        let pong_hits = state.pong_hits.clone();
+
+        let app = Router::new()
+            .route("/", post(heartbeat_post))
+            .route("/", get(heartbeat_get))
+            .with_state(state);
+        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
+        let addr = listener.local_addr().unwrap();
+        tokio::spawn(async move {
+            axum::serve(listener, app).await.unwrap();
+        });
+        let url = format!("http://{addr}/");
+
+        let registry = registry_with(vec![discovered("browser", &url, vec!["*"])]);
+        let dispatcher =
+            RouterToolDispatcher::discover_with_client(registry, reqwest::Client::new())
+                .await
+                .unwrap();
+        assert_eq!(dispatcher.len(), 1, "stateful server should be discovered");
+
+        // The keepalive runs in the background; give it a moment to open the
+        // GET stream, receive the ping, and POST the pong.
+        for _ in 0..50 {
+            if pong_hits.load(Ordering::SeqCst) >= 1 {
+                break;
+            }
+            tokio::time::sleep(Duration::from_millis(20)).await;
+        }
+
+        assert!(
+            get_hits.load(Ordering::SeqCst) >= 1,
+            "keepalive must open the standalone GET SSE stream"
+        );
+        assert!(
+            pong_hits.load(Ordering::SeqCst) >= 1,
+            "keepalive must answer the server's ping with a pong (else the upstream reaps the session)"
+        );
+    }
 }

From eb1a7fd0d43cb68b614ec7cdd27f31d7a8ba2003 Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:28:19 +0200
Subject: [PATCH 02/10] fix(deps): bump anyhow to 1.0.103 (RUSTSEC-2026-0190)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cargo-audit and cargo-deny on main fail on RUSTSEC-2026-0190 — unsoundness
in anyhow's Error::downcast_mut() (UB when downcasting a context-wrapped
error). Fixed in 1.0.103. Lockfile-only bump; no API changes.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0ad5ae27a..996d5f74e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -209,9 +209,9 @@ dependencies = [
 
 [[package]]
 name = "anyhow"
-version = "1.0.102"
+version = "1.0.103"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
+checksum = "2a4385e2e34eb35d6b3efe798b9eb88096925d87726c0798709bf56d9ed84af3"
 
 [[package]]
 name = "ar_archive_writer"

From 499d44daa761e111c9391c7f11e71213ae6d797e Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:28:30 +0200
Subject: [PATCH 03/10] feat(controller): auto-derive sandbox egress from
 referenced McpServer URLs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adding an MCP server should 'just work': the per-pod router is the only
network path to it, so the sandbox's default-deny NetworkPolicy must admit
the router→MCP hop. Until now the operator also had to hand-write a
networkPolicy.allowedEndpoints entry for every MCP; miss it and calls to an
in-cluster MCP silently time out.

The controller now parses each referenced McpServer's spec.url and emits the
correct egress rule automatically:
  - in-cluster Service DNS (*.svc.cluster.local) -> namespaceSelector rule.
    Under Cilium a K8s NetworkPolicy ipBlock (even 0.0.0.0/0) only matches the
    reserved 'world' entity and never an in-cluster pod, so an ipBlock rule
    would never admit traffic to another pod — namespaceSelector is the only
    form that works.
  - external host, non-443 -> coarse port-level ipBlock (router enforces host).
  - external host, 443 -> already covered by the blanket HTTPS rule.

New helpers mcp_url_host_port / cluster_internal_namespace / mcp_egress_rule
with unit tests; the reconcile loop walks effective_mcp_server_refs and adds
de-duplicated rules. Missing/unparseable referents are logged and skipped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 controller/src/reconciler/mod.rs | 289 ++++++++++++++++++++++++++++++-
 1 file changed, 285 insertions(+), 4 deletions(-)

diff --git a/controller/src/reconciler/mod.rs b/controller/src/reconciler/mod.rs
index a202ce24b..018271a90 100644
--- a/controller/src/reconciler/mod.rs
+++ b/controller/src/reconciler/mod.rs
@@ -89,6 +89,109 @@ pub(crate) fn isolation_scheduling(isolation: &str) -> (Option<&'static str>, &'
     }
 }
 
+/// Parse an `McpServer.spec.url` into the `(host, port)` the sandbox's
+/// inference-router must be allowed to reach.
+///
+/// Returns `None` when the URL is empty or unparseable (the caller skips
+/// egress derivation and lets the router surface the misconfiguration).
+/// The port defaults from the scheme (`https` → 443, `http` → 80) when not
+/// explicitly given. Only `http`/`https` MCP transports are supported.
+///
+/// Examples:
+/// - `https://mcp.deepwiki.com/mcp` → `("mcp.deepwiki.com", 443)`
+/// - `http://playwright-mcp.default.svc.cluster.local:8931/mcp`
+///   → `("playwright-mcp.default.svc.cluster.local", 8931)`
+pub(crate) fn mcp_url_host_port(url: &str) -> Option<(String, u16)> {
+    let url = url.trim();
+    let (scheme, rest) = url.split_once("://")?;
+    let default_port: u16 = match scheme.to_ascii_lowercase().as_str() {
+        "https" => 443,
+        "http" => 80,
+        _ => return None,
+    };
+    // Strip path / query / fragment — everything from the first '/', '?' or '#'.
+    let authority = rest.split(['/', '?', '#']).next().unwrap_or("").trim();
+    if authority.is_empty() {
+        return None;
+    }
+    // Drop any userinfo ("user:pass@host").
+    let host_port = authority.rsplit('@').next().unwrap_or(authority);
+    // IPv6 literal `[::1]:8931` — not expected for MCP services; reject so the
+    // caller falls back rather than mis-splitting on the inner colons.
+    if host_port.starts_with('[') {
+        return None;
+    }
+    let (host, port) = match host_port.rsplit_once(':') {
+        Some((h, p)) => {
+            let port = p.parse::<u16>().ok()?;
+            (h, port)
+        }
+        None => (host_port, default_port),
+    };
+    if host.is_empty() {
+        return None;
+    }
+    Some((host.to_string(), port))
+}
+
+/// Build the NetworkPolicy egress rule that admits the sandbox router to an
+/// MCP server at `host:port`. In-cluster Service DNS names get a
+/// `namespaceSelector` rule (the only form Cilium honours for pod
+/// destinations); external hosts get the coarse port-level `ipBlock`
+/// (host-level enforcement lives in the router). Returns `None` for an
+/// external host on 443 — already covered by the blanket HTTPS rule.
+fn mcp_egress_rule(host: &str, port: u16) -> Option<serde_json::Value> {
+    if let Some(ns) = cluster_internal_namespace(host) {
+        Some(json!({
+            "to": [{
+                "namespaceSelector": {
+                    "matchLabels": { "kubernetes.io/metadata.name": ns }
+                }
+            }],
+            "ports": [{"protocol": "TCP", "port": port}]
+        }))
+    } else if port == 443 {
+        None
+    } else {
+        Some(json!({
+            "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}],
+            "ports": [{"protocol": "TCP", "port": port}]
+        }))
+    }
+}
+
+/// If `host` is a cluster-internal Kubernetes DNS name, return the namespace
+/// it resolves into; otherwise `None`.
+///
+/// In-cluster MCP servers / egress endpoints are addressed by their Service
+/// DNS name — `<svc>.<ns>.svc.cluster.local` (or the `.svc` short form). For
+/// those we MUST express the NetworkPolicy egress with a `namespaceSelector`,
+/// not an `ipBlock`: under the Cilium CNI a K8s NetworkPolicy `ipBlock` CIDR
+/// (including `0.0.0.0/0`) only matches the reserved `world` entity and is
+/// NOT applied to in-cluster pod endpoints, so an `ipBlock`-based rule silently
+/// fails to admit traffic to another pod. Selecting the destination namespace
+/// by its well-known `kubernetes.io/metadata.name` label is the portable way to
+/// open egress to an in-cluster Service.
+///
+/// Returns `None` for external hostnames (e.g. `api.openai.com`) and for bare,
+/// namespace-ambiguous names — those keep the coarse `ipBlock` port rule, with
+/// fine-grained host enforcement handled by the router's CONNECT allowlist.
+pub(crate) fn cluster_internal_namespace(host: &str) -> Option<String> {
+    let host = host.trim().trim_end_matches('.');
+    // `<svc>.<ns>.svc.cluster.local` or the cluster-suffixless `<svc>.<ns>.svc`.
+    let stripped = host
+        .strip_suffix(".svc.cluster.local")
+        .or_else(|| host.strip_suffix(".svc"))?;
+    // After stripping the `.svc[.cluster.local]` suffix, the remainder is
+    // `<svc>.<ns>`; the namespace is the last label.
+    let ns = stripped.rsplit('.').next()?;
+    if ns.is_empty() || ns == stripped {
+        // `stripped` had no `.` → no namespace label present.
+        return None;
+    }
+    Some(ns.to_string())
+}
+
 /// Build the egress-guard init-container command.
 ///
 /// Standard sandboxes (every kind except SRE) get the full lockdown:
@@ -251,9 +354,100 @@ mod egress_guard_tests {
             );
         }
     }
-}
 
-/// Custom error type that bridges serde_json and kube errors.
+    #[test]
+    fn cluster_internal_namespace_parses_fqdn_forms() {
+        use super::cluster_internal_namespace;
+        assert_eq!(
+            cluster_internal_namespace("playwright-mcp.default.svc.cluster.local"),
+            Some("default".to_string())
+        );
+        assert_eq!(
+            cluster_internal_namespace("playwright-mcp.default.svc"),
+            Some("default".to_string())
+        );
+        // Trailing dot (fully-qualified) is tolerated.
+        assert_eq!(
+            cluster_internal_namespace("svc-x.team-a.svc.cluster.local."),
+            Some("team-a".to_string())
+        );
+    }
+
+    #[test]
+    fn cluster_internal_namespace_rejects_external_and_ambiguous() {
+        use super::cluster_internal_namespace;
+        // External hostnames keep the coarse ipBlock rule.
+        assert_eq!(cluster_internal_namespace("api.openai.com"), None);
+        assert_eq!(cluster_internal_namespace("example.com"), None);
+        // Bare names without a namespace label are ambiguous → None.
+        assert_eq!(cluster_internal_namespace("playwright-mcp"), None);
+        // A `.svc` suffix with no namespace label is not a valid in-cluster name.
+        assert_eq!(cluster_internal_namespace("svc"), None);
+    }
+
+    #[test]
+    fn mcp_url_host_port_parses_common_forms() {
+        use super::mcp_url_host_port;
+        // Remote HTTPS MCP — default port 443.
+        assert_eq!(
+            mcp_url_host_port("https://mcp.deepwiki.com/mcp"),
+            Some(("mcp.deepwiki.com".to_string(), 443))
+        );
+        assert_eq!(
+            mcp_url_host_port("https://api.githubcopilot.com/mcp"),
+            Some(("api.githubcopilot.com".to_string(), 443))
+        );
+        // In-cluster HTTP MCP with explicit port.
+        assert_eq!(
+            mcp_url_host_port("http://playwright-mcp.default.svc.cluster.local:8931/mcp"),
+            Some(("playwright-mcp.default.svc.cluster.local".to_string(), 8931))
+        );
+        // http default port 80, no path.
+        assert_eq!(
+            mcp_url_host_port("http://svc.ns.svc.cluster.local"),
+            Some(("svc.ns.svc.cluster.local".to_string(), 80))
+        );
+        // Query/fragment are stripped.
+        assert_eq!(
+            mcp_url_host_port("https://host.example:8443/mcp?x=1#frag"),
+            Some(("host.example".to_string(), 8443))
+        );
+    }
+
+    #[test]
+    fn mcp_url_host_port_rejects_invalid() {
+        use super::mcp_url_host_port;
+        assert_eq!(mcp_url_host_port(""), None);
+        assert_eq!(mcp_url_host_port("not-a-url"), None);
+        assert_eq!(mcp_url_host_port("ftp://host:21/x"), None);
+        assert_eq!(mcp_url_host_port("https://"), None);
+        // Non-numeric port.
+        assert_eq!(mcp_url_host_port("https://host:notaport/mcp"), None);
+        // IPv6 literals are not supported (fall back rather than mis-split).
+        assert_eq!(mcp_url_host_port("http://[::1]:8931/mcp"), None);
+    }
+
+    #[test]
+    fn mcp_egress_rule_shapes_by_destination() {
+        use super::mcp_egress_rule;
+        // In-cluster → namespaceSelector on the destination namespace + port.
+        let rule = mcp_egress_rule("playwright-mcp.default.svc.cluster.local", 8931)
+            .expect("in-cluster rule");
+        assert_eq!(
+            rule["to"][0]["namespaceSelector"]["matchLabels"]["kubernetes.io/metadata.name"],
+            "default"
+        );
+        assert_eq!(rule["ports"][0]["port"], 8931);
+
+        // External non-443 → coarse ipBlock + port.
+        let rule = mcp_egress_rule("mcp.example.com", 8080).expect("external non-443 rule");
+        assert_eq!(rule["to"][0]["ipBlock"]["cidr"], "0.0.0.0/0");
+        assert_eq!(rule["ports"][0]["port"], 8080);
+
+        // External 443 → None (blanket HTTPS rule already covers it).
+        assert!(mcp_egress_rule("mcp.deepwiki.com", 443).is_none());
+    }
+}
 #[derive(Debug, thiserror::Error)]
 enum ReconcileError {
     #[error("Kubernetes API error: {0}")]
@@ -1184,8 +1378,26 @@ async fn reconcile(sandbox: Arc<KarsSandbox>, ctx: Arc<Context>) -> Result<Actio
     if let Some(endpoints) = allowlist_resolution.endpoints.as_ref() {
         for ep in endpoints {
             let port = ep.port.unwrap_or(443);
-            if port != 443 {
-                // Only add non-443 ports (443 is already covered by the blanket HTTPS rule)
+            if port == 443 {
+                // 443 is already covered by the blanket HTTPS rule.
+                continue;
+            }
+            // In-cluster MCP servers / endpoints (`<svc>.<ns>.svc.cluster.local`)
+            // must be admitted with a `namespaceSelector`: a `0.0.0.0/0` ipBlock
+            // does NOT match in-cluster pods under Cilium, so the agent's router
+            // could never reach an in-cluster MCP server. External hosts keep the
+            // coarse port-level ipBlock (host-level enforcement lives in the
+            // router's CONNECT allowlist).
+            if let Some(ns) = cluster_internal_namespace(&ep.host) {
+                egress_rules.push(json!({
+                    "to": [{
+                        "namespaceSelector": {
+                            "matchLabels": { "kubernetes.io/metadata.name": ns }
+                        }
+                    }],
+                    "ports": [{"protocol": "TCP", "port": port}]
+                }));
+            } else {
                 egress_rules.push(json!({
                     "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}],
                     "ports": [{"protocol": "TCP", "port": port}]
@@ -1194,6 +1406,75 @@ async fn reconcile(sandbox: Arc<KarsSandbox>, ctx: Arc<Context>) -> Result<Actio
         }
     }
 
+    // ── Auto-derive egress to referenced McpServer endpoints ─────────────
+    //
+    // "Apply an McpServer CRD and reference it — it just works." The router
+    // is the only network path to an MCP server, so the sandbox NetworkPolicy
+    // must admit the router→server hop WITHOUT the operator also hand-writing
+    // a `networkPolicy.allowedEndpoints` entry. For every McpServer the
+    // sandbox references via `governance.mcpServerRefs`, we parse its
+    // `spec.url` and emit the matching egress rule:
+    //   - in-cluster Service DNS  → `namespaceSelector` (Cilium-correct)
+    //   - external host, non-443  → coarse `ipBlock` + port
+    //   - external host, 443      → already covered by the blanket HTTPS rule
+    //
+    // Remote HTTPS MCPs (DeepWiki, GitHub, hosted Playwright) therefore need
+    // zero extra config; self-hosted in-cluster MCPs on a non-443 port get
+    // their egress opened automatically.
+    {
+        let mcp_api: Api<crate::mcp_server::McpServer> =
+            Api::namespaced(client.clone(), &sandbox_self_ns);
+        for mcp_ref in governance_config.effective_mcp_server_refs() {
+            let ref_name = mcp_ref.name.trim();
+            if ref_name.is_empty() {
+                continue;
+            }
+            let url = match mcp_api.get(ref_name).await {
+                Ok(mcp) => mcp.spec.url.unwrap_or_default(),
+                Err(kube::Error::Api(ae)) if ae.code == 404 => {
+                    // Missing referent is surfaced by the JWKS-mirror path
+                    // below; here we just skip egress derivation for it.
+                    tracing::warn!(
+                        sandbox = %name,
+                        mcp = %ref_name,
+                        "referenced McpServer not found; egress not auto-derived",
+                    );
+                    continue;
+                }
+                Err(e) => {
+                    tracing::error!(error = %e, sandbox = %name, mcp = %ref_name, "McpServer lookup failed");
+                    return Ok(Action::requeue(Duration::from_secs(15)));
+                }
+            };
+            // Empty when the signed-bundle authoring path supplies the url;
+            // those endpoints still need an explicit allowlist entry.
+            if url.is_empty() {
+                continue;
+            }
+            let Some((host, port)) = mcp_url_host_port(&url) else {
+                tracing::warn!(
+                    sandbox = %name,
+                    mcp = %ref_name,
+                    url = %url,
+                    "McpServer url not parseable for egress derivation; skipping",
+                );
+                continue;
+            };
+            if let Some(rule) = mcp_egress_rule(&host, port)
+                && !egress_rules.contains(&rule)
+            {
+                tracing::info!(
+                    sandbox = %name,
+                    mcp = %ref_name,
+                    host = %host,
+                    port,
+                    "auto-derived MCP egress rule",
+                );
+                egress_rules.push(rule);
+            }
+        }
+    }
+
     // Compute ingress rules up front. When governance is enabled the sandbox
     // exposes :8443 (mesh inference) and :18789/:18791 (gateway WebUX/WebSocket)
     // to peer sandbox namespaces, plus :8443 to the operator namespace for

From 18fc9c5c9ac6d34e99b58b88f97ec66ee6f05302 Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:28:39 +0200
Subject: [PATCH 04/10] feat(cli): kars update + automatic CLI update notice

Every kars run now does a quiet, cached check (at most once per day) of the
npm registry and prints a one-line notice when a newer @kars-runtime/cli has
been published, with a short changelog summary pulled from the GitHub release.
The new 'kars update' command is the explicit, always-fresh path: it shows the
changelog and offers to install (npm install -g @kars-runtime/cli@latest), with
--check (report-only, non-zero exit) and --yes (non-interactive) modes.

Best-effort and unobtrusive by design: hard 1.5s network timeout, all errors
swallowed, result cached in ~/.kars/update-check.json, never prompts after an
arbitrary command, and silent in CI / non-TTY / when KARS_NO_UPDATE_CHECK=1.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 cli/src/cli.ts                   |   5 +
 cli/src/commands/update.ts       |  59 ++++++
 cli/src/index.ts                 |  12 +-
 cli/src/lib/update-check.test.ts | 156 +++++++++++++++
 cli/src/lib/update-check.ts      | 319 +++++++++++++++++++++++++++++++
 5 files changed, 550 insertions(+), 1 deletion(-)
 create mode 100644 cli/src/commands/update.ts
 create mode 100644 cli/src/lib/update-check.test.ts
 create mode 100644 cli/src/lib/update-check.ts

diff --git a/cli/src/cli.ts b/cli/src/cli.ts
index 4b2539939..a018a62d9 100644
--- a/cli/src/cli.ts
+++ b/cli/src/cli.ts
@@ -39,6 +39,7 @@ import { inspectCommand } from "./commands/inspect.js";
 import { auditCommand } from "./commands/audit.js";
 import { headlampCommand } from "./commands/headlamp.js";
 import { sreCommand } from "./commands/sre.js";
+import { updateCommand } from "./commands/update.js";
 
 export function createCli(): Command {
   const program = new Command();
@@ -100,6 +101,9 @@ export function createCli(): Command {
   // Attestation
   program.addCommand(attestCommand());
 
+  // Self-management
+  program.addCommand(updateCommand());
+
   program.addHelpText("after", `
 Command groups:
   Lifecycle       up, dev, add, push, destroy
@@ -110,6 +114,7 @@ Command groups:
   Interop         convert, a2a, a2a-agent, migrate
   Governance      toolpolicy, inferencepolicy, mcp, memory
   Attestation     attest
+  Self            update
 
 Quick start:
   kars up                    # Provision Azure + deploy controller + first sandbox
diff --git a/cli/src/commands/update.ts b/cli/src/commands/update.ts
new file mode 100644
index 000000000..053808d7c
--- /dev/null
+++ b/cli/src/commands/update.ts
@@ -0,0 +1,59 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// commands/update.ts — `kars update`: explicitly check npm for a newer
+// `@kars-runtime/cli`, show the changelog, and (interactively) install it.
+//
+// This is the on-demand counterpart to the passive end-of-run notice wired into
+// every invocation (lib/update-check.ts). It always hits the network (bypassing
+// the 24h cache) and always offers to install when a newer version exists.
+
+import { Command } from "commander";
+import chalk from "chalk";
+import { cliVersion } from "../lib/version.js";
+import {
+  CLI_PACKAGE,
+  checkForCliUpdate,
+  renderUpdateNotice,
+} from "../lib/update-check.js";
+
+export function updateCommand(): Command {
+  const cmd = new Command("update");
+
+  cmd
+    .description(`Check for and install a newer ${CLI_PACKAGE}`)
+    .option("--check", "Only check; never prompt or install")
+    .option("--yes", "Install the latest version without prompting")
+    .action(async (options: { check?: boolean; yes?: boolean }) => {
+      const info = await checkForCliUpdate({ force: true, withChangelog: true });
+
+      if (!info) {
+        console.error(
+          chalk.green(`\n  ✓ ${CLI_PACKAGE} is up to date `) +
+            chalk.dim(`(v${cliVersion()}).\n`),
+        );
+        return;
+      }
+
+      if (options.check) {
+        // Report-only: print the notice without the install prompt.
+        await renderUpdateNotice(info, { offerInstall: false });
+        // Non-zero so scripts/CI can detect "an update is available".
+        process.exitCode = 1;
+        return;
+      }
+
+      if (options.yes) {
+        const { runGlobalInstall } = await import("../lib/update-check.js");
+        await renderUpdateNotice(info, { offerInstall: false });
+        await runGlobalInstall();
+        return;
+      }
+
+      await renderUpdateNotice(info, {
+        offerInstall: !!process.stdout.isTTY && !!process.stdin.isTTY,
+      });
+    });
+
+  return cmd;
+}
diff --git a/cli/src/index.ts b/cli/src/index.ts
index 0e0b4ac43..051f3b1fe 100644
--- a/cli/src/index.ts
+++ b/cli/src/index.ts
@@ -11,7 +11,17 @@
 
 import { createCli } from "./cli.js";
 import { bootstrapKubeContext } from "./lib/kube-bootstrap.js";
+import { maybeNotifyCliUpdate } from "./lib/update-check.js";
 
 await bootstrapKubeContext(process.argv);
 const program = createCli();
-program.parse(process.argv);
+await program.parseAsync(process.argv);
+
+// Passive, best-effort: tell the user if a newer CLI has been published. Runs
+// after the command so it never delays or clutters real output, is cached to
+// at most one network check per day, and stays silent in CI / non-TTY / when
+// opted out (KARS_NO_UPDATE_CHECK=1). The interactive install lives in
+// `kars update`. Skip when the user already ran `kars update`.
+if (!process.argv.slice(2).includes("update")) {
+  await maybeNotifyCliUpdate();
+}
diff --git a/cli/src/lib/update-check.test.ts b/cli/src/lib/update-check.test.ts
new file mode 100644
index 000000000..cea271ea2
--- /dev/null
+++ b/cli/src/lib/update-check.test.ts
@@ -0,0 +1,156 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
+import os from "node:os";
+import path from "node:path";
+import { promises as fs } from "node:fs";
+import {
+  updateCheckDisabled,
+  summarizeReleaseBody,
+  fetchLatestVersion,
+  fetchChangelogSummary,
+  checkForCliUpdate,
+  CLI_PACKAGE,
+} from "./update-check.js";
+
+// Redirect the on-disk cache into a throwaway temp dir so tests never touch the
+// developer's real ~/.kars/update-check.json.
+let stateDir: string;
+beforeEach(async () => {
+  stateDir = await fs.mkdtemp(path.join(os.tmpdir(), "kars-update-test-"));
+  process.env.KARS_STATE_DIR = stateDir;
+});
+afterEach(async () => {
+  delete process.env.KARS_STATE_DIR;
+  await fs.rm(stateDir, { recursive: true, force: true });
+});
+
+describe("updateCheckDisabled", () => {
+  it("is enabled by default", () => {
+    expect(updateCheckDisabled({})).toBe(false);
+  });
+
+  it("respects explicit opt-out env vars", () => {
+    expect(updateCheckDisabled({ KARS_NO_UPDATE_CHECK: "1" })).toBe(true);
+    expect(updateCheckDisabled({ KARS_NO_UPDATE_CHECK: "true" })).toBe(true);
+    expect(updateCheckDisabled({ KARS_UPDATE_CHECK: "0" })).toBe(true);
+    expect(updateCheckDisabled({ KARS_UPDATE_CHECK: "false" })).toBe(true);
+  });
+
+  it("disables in CI", () => {
+    expect(updateCheckDisabled({ CI: "true" })).toBe(true);
+    expect(updateCheckDisabled({ CI: "1" })).toBe(true);
+    // A falsey CI value does not disable.
+    expect(updateCheckDisabled({ CI: "0" })).toBe(false);
+    expect(updateCheckDisabled({ CI: "false" })).toBe(false);
+  });
+});
+
+describe("summarizeReleaseBody", () => {
+  it("prefers the summary half of a 'vX — summary' release title", () => {
+    expect(summarizeReleaseBody("v0.1.24 — MCP keepalive + egress auto-derive", ""))
+      .toBe("MCP keepalive + egress auto-derive");
+  });
+
+  it("falls back to the first meaningful body line, stripped of markdown", () => {
+    const body = "## What's changed\n\n- Fixed the Playwright `about:blank` session reaper\n";
+    expect(summarizeReleaseBody(undefined, body))
+      .toBe("Fixed the Playwright about:blank session reaper");
+  });
+
+  it("truncates very long summaries", () => {
+    const long = "x".repeat(200);
+    const out = summarizeReleaseBody(long, "")!;
+    expect(out.length).toBeLessThanOrEqual(100);
+    expect(out.endsWith("…")).toBe(true);
+  });
+
+  it("returns undefined when nothing usable is present", () => {
+    expect(summarizeReleaseBody(undefined, undefined)).toBeUndefined();
+    expect(summarizeReleaseBody("", "   \n  \n")).toBeUndefined();
+  });
+});
+
+describe("fetchLatestVersion", () => {
+  const realFetch = globalThis.fetch;
+  afterEach(() => {
+    globalThis.fetch = realFetch;
+    vi.restoreAllMocks();
+  });
+
+  it("returns the version from the registry latest dist-tag", async () => {
+    globalThis.fetch = vi.fn(async () =>
+      new Response(JSON.stringify({ version: "0.2.0" }), { status: 200 }),
+    ) as unknown as typeof fetch;
+    expect(await fetchLatestVersion(CLI_PACKAGE, 500)).toBe("0.2.0");
+  });
+
+  it("returns null on a non-200 response", async () => {
+    globalThis.fetch = vi.fn(async () => new Response("nope", { status: 404 })) as unknown as typeof fetch;
+    expect(await fetchLatestVersion(CLI_PACKAGE, 500)).toBeNull();
+  });
+
+  it("returns null (never throws) on a network error", async () => {
+    globalThis.fetch = vi.fn(async () => {
+      throw new Error("ECONNREFUSED");
+    }) as unknown as typeof fetch;
+    expect(await fetchLatestVersion(CLI_PACKAGE, 500)).toBeNull();
+  });
+});
+
+describe("fetchChangelogSummary", () => {
+  const realFetch = globalThis.fetch;
+  afterEach(() => {
+    globalThis.fetch = realFetch;
+    vi.restoreAllMocks();
+  });
+
+  it("summarizes the GitHub release for the tag", async () => {
+    globalThis.fetch = vi.fn(async () =>
+      new Response(JSON.stringify({ name: "v0.2.0 — big fixes", body: "" }), { status: 200 }),
+    ) as unknown as typeof fetch;
+    expect(await fetchChangelogSummary("0.2.0", 500)).toBe("big fixes");
+  });
+
+  it("returns undefined when the release is missing", async () => {
+    globalThis.fetch = vi.fn(async () => new Response("", { status: 404 })) as unknown as typeof fetch;
+    expect(await fetchChangelogSummary("9.9.9", 500)).toBeUndefined();
+  });
+});
+
+describe("checkForCliUpdate", () => {
+  const realFetch = globalThis.fetch;
+  afterEach(() => {
+    globalThis.fetch = realFetch;
+    vi.restoreAllMocks();
+  });
+
+  it("returns null when disabled", async () => {
+    const info = await checkForCliUpdate({ env: { KARS_NO_UPDATE_CHECK: "1" } });
+    expect(info).toBeNull();
+  });
+
+  it("returns null when the registry reports an older/equal version", async () => {
+    globalThis.fetch = vi.fn(async () =>
+      new Response(JSON.stringify({ version: "0.0.0" }), { status: 200 }),
+    ) as unknown as typeof fetch;
+    const info = await checkForCliUpdate({ force: true });
+    expect(info).toBeNull();
+  });
+
+  it("reports an update when the registry has a newer version", async () => {
+    globalThis.fetch = vi.fn(async (url: string | URL) => {
+      const u = String(url);
+      if (u.includes("registry.npmjs.org")) {
+        return new Response(JSON.stringify({ version: "999.0.0" }), { status: 200 });
+      }
+      // changelog
+      return new Response(JSON.stringify({ name: "v999.0.0 — the future", body: "" }), { status: 200 });
+    }) as unknown as typeof fetch;
+    const info = await checkForCliUpdate({ force: true, withChangelog: true });
+    expect(info).not.toBeNull();
+    expect(info!.latest).toBe("999.0.0");
+    expect(info!.changelog).toBe("the future");
+  });
+});
diff --git a/cli/src/lib/update-check.ts b/cli/src/lib/update-check.ts
new file mode 100644
index 000000000..740d9f77a
--- /dev/null
+++ b/cli/src/lib/update-check.ts
@@ -0,0 +1,319 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// lib/update-check.ts — best-effort "a newer CLI is published" notifier.
+//
+// Every interactive `kars` run quietly checks whether a newer
+// `@kars-runtime/cli` has been published to npm and, if so, prints a short
+// notice (current → latest + a one-line changelog) and offers to install it.
+// Design constraints, in priority order:
+//   1. NEVER break or slow a command. The check is wrapped in a hard timeout,
+//      all errors are swallowed, and the result is cached so we hit the network
+//      at most once per `CHECK_TTL_MS` window.
+//   2. NEVER nag. We surface the notice at most once per `CHECK_TTL_MS`, only on
+//      a TTY, never in CI, and respect an explicit opt-out env var.
+//   3. NEVER act without consent. The install only runs if the user confirms the
+//      interactive prompt; otherwise we just print the copy-paste command.
+//
+// The cache lives at `~/.kars/update-check.json`. It records the last fetched
+// latest version, when we fetched it, and when we last showed the notice — so
+// the notice is decoupled from the network check (classic update-notifier
+// behaviour: we may show a previously-fetched result while refreshing for next
+// time).
+
+import { promises as fs } from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { cliVersion } from "./version.js";
+import { compareVersions } from "./release.js";
+
+/** npm package this CLI is published as. */
+export const CLI_PACKAGE = "@kars-runtime/cli";
+
+/** How long a network result / shown-notice is considered fresh. */
+export const CHECK_TTL_MS = 24 * 60 * 60 * 1000; // 24h
+
+/** Hard cap on the registry request so a slow/unreachable network never delays
+ *  the user's actual command by more than this. */
+const FETCH_TIMEOUT_MS = 1500;
+
+const CACHE_DIR = path.join(os.homedir(), ".kars");
+const CACHE_FILE = path.join(CACHE_DIR, "update-check.json");
+
+/** Resolve the cache file, honouring `KARS_STATE_DIR` so tests (and unusual
+ *  environments) can redirect the on-disk cache away from `~/.kars`. */
+function cacheFile(): string {
+  const override = process.env.KARS_STATE_DIR;
+  return override ? path.join(override, "update-check.json") : CACHE_FILE;
+}
+
+interface UpdateCache {
+  /** Latest version string seen on the registry (no `v` prefix), or null. */
+  latest: string | null;
+  /** Epoch ms of the last successful (or attempted) registry fetch. */
+  checkedAt: number;
+  /** Epoch ms we last printed the notice to the user. */
+  notifiedAt: number;
+}
+
+export interface UpdateInfo {
+  current: string;
+  latest: string;
+  /** Short changelog blurb for `latest`, when one is cheaply available. */
+  changelog?: string;
+}
+
+/** True when update checks are disabled for this process: explicit opt-out, CI,
+ *  or a non-interactive stdout (piped / redirected). Exported for tests. */
+export function updateCheckDisabled(env: NodeJS.ProcessEnv = process.env): boolean {
+  if (env.KARS_NO_UPDATE_CHECK === "1" || env.KARS_NO_UPDATE_CHECK === "true") return true;
+  if (env.KARS_UPDATE_CHECK === "0" || env.KARS_UPDATE_CHECK === "false") return true;
+  // Common CI signal — never nag an automated pipeline.
+  if (env.CI && env.CI !== "0" && env.CI !== "false") return true;
+  return false;
+}
+
+async function readCache(): Promise<UpdateCache | null> {
+  try {
+    const raw = await fs.readFile(cacheFile(), "utf8");
+    const parsed = JSON.parse(raw) as Partial<UpdateCache>;
+    if (typeof parsed.checkedAt !== "number") return null;
+    return {
+      latest: typeof parsed.latest === "string" ? parsed.latest : null,
+      checkedAt: parsed.checkedAt,
+      notifiedAt: typeof parsed.notifiedAt === "number" ? parsed.notifiedAt : 0,
+    };
+  } catch {
+    return null;
+  }
+}
+
+async function writeCache(cache: UpdateCache): Promise<void> {
+  try {
+    const file = cacheFile();
+    await fs.mkdir(path.dirname(file), { recursive: true });
+    await fs.writeFile(file, JSON.stringify(cache), "utf8");
+  } catch {
+    // Cache is an optimisation; a write failure must never surface.
+  }
+}
+
+/** Fetch the `latest` dist-tag version from the npm registry. Best-effort:
+ *  returns null on any error or timeout. Exported for tests. */
+export async function fetchLatestVersion(
+  pkg: string = CLI_PACKAGE,
+  timeoutMs: number = FETCH_TIMEOUT_MS,
+): Promise<string | null> {
+  // The `latest` endpoint returns just the dist-tag manifest — far smaller than
+  // the full packument.
+  const url = `https://registry.npmjs.org/${pkg.replace("/", "%2f")}/latest`;
+  try {
+    const resp = await fetch(url, {
+      signal: AbortSignal.timeout(timeoutMs),
+      headers: { accept: "application/json" },
+    });
+    if (!resp.ok) return null;
+    const body = (await resp.json()) as { version?: string };
+    return typeof body.version === "string" ? body.version : null;
+  } catch {
+    return null;
+  }
+}
+
+/** A terse, best-effort one-line changelog for `version`, pulled from the
+ *  GitHub release body. Returns undefined when unavailable. Exported for tests. */
+export async function fetchChangelogSummary(
+  version: string,
+  timeoutMs: number = FETCH_TIMEOUT_MS,
+): Promise<string | undefined> {
+  const tag = version.startsWith("v") ? version : `v${version}`;
+  const url = `https://api.github.com/repos/Azure/kars/releases/tags/${tag}`;
+  try {
+    const resp = await fetch(url, {
+      signal: AbortSignal.timeout(timeoutMs),
+      headers: { accept: "application/vnd.github+json" },
+    });
+    if (!resp.ok) return undefined;
+    const body = (await resp.json()) as { name?: string; body?: string };
+    return summarizeReleaseBody(body.name, body.body);
+  } catch {
+    return undefined;
+  }
+}
+
+/** Reduce a GitHub release `name`/`body` to a single readable line. Prefers the
+ *  release title; otherwise the first meaningful bullet/line of the body.
+ *  Exported for tests. */
+export function summarizeReleaseBody(name?: string, body?: string): string | undefined {
+  const clean = (s: string): string =>
+    s.replace(/^[#>*\-\s]+/, "").replace(/`/g, "").replace(/\s+/g, " ").trim();
+  if (name && name.trim()) {
+    // Release titles are often "vX.Y.Z — summary"; keep the summary half.
+    const dash = name.split(/\s[—-]\s/);
+    const tail = dash.length > 1 ? dash.slice(1).join(" - ") : name;
+    const t = clean(tail);
+    if (t) return truncate(t);
+  }
+  if (body) {
+    for (const line of body.split(/\r?\n/)) {
+      // Skip markdown headings ("## What's changed") — we want a content line.
+      if (/^\s*#/.test(line)) continue;
+      const c = clean(line);
+      if (c.length >= 8) return truncate(c);
+    }
+  }
+  return undefined;
+}
+
+function truncate(s: string, max = 100): string {
+  return s.length > max ? `${s.slice(0, max - 1).trimEnd()}…` : s;
+}
+
+/**
+ * Determine whether an update is available, using the cache to avoid hitting the
+ * network more than once per TTL. Returns null when up-to-date, disabled, or the
+ * lookup couldn't determine a newer version.
+ *
+ * `force` bypasses the TTL (used by the explicit `kars update` command).
+ */
+export async function checkForCliUpdate(opts: {
+  force?: boolean;
+  withChangelog?: boolean;
+  env?: NodeJS.ProcessEnv;
+} = {}): Promise<UpdateInfo | null> {
+  const env = opts.env ?? process.env;
+  if (!opts.force && updateCheckDisabled(env)) return null;
+
+  const current = cliVersion();
+  const now = Date.now();
+  const cache = await readCache();
+
+  let latest = cache?.latest ?? null;
+  const fresh = cache && now - cache.checkedAt < CHECK_TTL_MS;
+
+  if (opts.force || !fresh) {
+    const fetched = await fetchLatestVersion();
+    // Record the attempt regardless of outcome so a flaky network doesn't make
+    // us re-check on every single invocation.
+    latest = fetched ?? latest;
+    await writeCache({
+      latest,
+      checkedAt: now,
+      notifiedAt: cache?.notifiedAt ?? 0,
+    });
+  }
+
+  if (!latest) return null;
+  if (compareVersions(latest, current) <= 0) return null;
+
+  const info: UpdateInfo = { current, latest };
+  if (opts.withChangelog) {
+    info.changelog = await fetchChangelogSummary(latest);
+  }
+  return info;
+}
+
+/** Record that we've shown the notice now, so we don't repeat it within the TTL. */
+async function markNotified(): Promise<void> {
+  const cache = (await readCache()) ?? { latest: null, checkedAt: Date.now(), notifiedAt: 0 };
+  cache.notifiedAt = Date.now();
+  await writeCache(cache);
+}
+
+/** True if we've already shown a notice within the TTL window. */
+async function notifiedRecently(): Promise<boolean> {
+  const cache = await readCache();
+  return !!cache && Date.now() - cache.notifiedAt < CHECK_TTL_MS;
+}
+
+/**
+ * The passive, end-of-run hook wired into every `kars` invocation. Best-effort:
+ * checks for an update (cached) and, at most once per TTL on an interactive
+ * terminal, offers to install it. Silent and side-effect-free when up-to-date,
+ * disabled, non-interactive, or already shown recently.
+ *
+ * Returns true if a notice was shown (used by tests).
+ */
+export async function maybeNotifyCliUpdate(): Promise<boolean> {
+  try {
+    if (updateCheckDisabled()) return false;
+    if (await notifiedRecently()) return false;
+
+    const info = await checkForCliUpdate({ withChangelog: true });
+    if (!info) return false;
+
+    await markNotified();
+    // The passive notice never hijacks stdin after an arbitrary command — it
+    // prints the changelog + how to upgrade. The interactive install lives in
+    // `kars update`, which the notice points to.
+    await renderUpdateNotice(info, { offerInstall: false });
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Render the update notice. On an interactive TTY (`offerInstall`) it prompts to
+ * run the global install; otherwise it just prints the copy-paste command.
+ * Exported so the `kars update` command can reuse the exact same presentation.
+ */
+export async function renderUpdateNotice(
+  info: UpdateInfo,
+  opts: { offerInstall?: boolean } = {},
+): Promise<void> {
+  const { default: chalk } = await import("chalk");
+  const installCmd = `npm install -g ${CLI_PACKAGE}@latest`;
+
+  const lines = [
+    "",
+    chalk.cyan(`  ✨ A new kars CLI is available: `) +
+      chalk.dim(info.current) + chalk.cyan("  →  ") + chalk.green.bold(info.latest),
+  ];
+  if (info.changelog) {
+    lines.push(chalk.dim(`     ${info.changelog}`));
+  }
+  lines.push(
+    chalk.dim(`     Release notes: https://github.com/Azure/kars/releases/tag/v${info.latest}`),
+    "",
+  );
+  console.error(lines.join("\n"));
+
+  if (opts.offerInstall) {
+    const { default: inquirer } = await import("inquirer");
+    const { doInstall } = await inquirer.prompt<{ doInstall: boolean }>([
+      {
+        type: "confirm",
+        name: "doInstall",
+        message: `Update to ${info.latest} now?`,
+        default: true,
+      },
+    ]);
+    if (doInstall) {
+      await runGlobalInstall();
+      return;
+    }
+  }
+  console.error(
+    chalk.dim(`  Run `) + chalk.bold(`kars update`) +
+      chalk.dim(` to install it, or: `) + chalk.bold(installCmd) + "\n",
+  );
+}
+
+/** Run the global npm install of the latest CLI, streaming output. Best-effort:
+ *  reports failure with the manual command rather than throwing. */
+export async function runGlobalInstall(): Promise<void> {
+  const { default: chalk } = await import("chalk");
+  const { execa } = await import("execa");
+  const installCmd = `npm install -g ${CLI_PACKAGE}@latest`;
+  try {
+    console.error(chalk.dim(`\n  $ ${installCmd}\n`));
+    await execa("npm", ["install", "-g", `${CLI_PACKAGE}@latest`], { stdio: "inherit" });
+    console.error(chalk.green(`\n  ✓ Updated. Re-run your command to use ${CLI_PACKAGE}@latest.\n`));
+  } catch {
+    console.error(
+      chalk.yellow(`\n  ⚠ Automatic update failed. Update manually with:\n  `) +
+        chalk.bold(installCmd) + "\n",
+    );
+  }
+}

From c7d1e2c94540a91f9385caeba54c85e0cad0ff91 Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:28:46 +0200
Subject: [PATCH 05/10] feat(upgrade): pre-flight server-side-apply conflict
 detection

Helm v4 applies releases server-side, so a Deployment field owned by another
field manager (e.g. a manual 'kubectl set env HERMES_RUNTIME_IMAGE') makes both
the real apply AND its atomic rollback fail with a conflict, wedging the
release mid-upgrade.

kars upgrade now runs a server-side dry-run first and parses any field-manager
conflicts. On conflict it stops BEFORE any change and prints the offending
object/field/manager plus two remediation paths: --force-conflicts to take
ownership, or a precise copy-paste command to hand the field back. It never
silently clobbers a field a live operator legitimately manages.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 cli/src/commands/upgrade.test.ts |  97 +++++++++++++
 cli/src/commands/upgrade.ts      | 226 ++++++++++++++++++++++++++++++-
 2 files changed, 318 insertions(+), 5 deletions(-)

diff --git a/cli/src/commands/upgrade.test.ts b/cli/src/commands/upgrade.test.ts
index ee429a2ab..683794471 100644
--- a/cli/src/commands/upgrade.test.ts
+++ b/cli/src/commands/upgrade.test.ts
@@ -9,6 +9,8 @@ import {
   detectCurrentVersion,
   rolloutRestartAll,
   verifyHealth,
+  parseHelmFieldConflicts,
+  suggestConflictRemediation,
 } from "./upgrade.js";
 
 describe("isFoundryProjectHost", () => {
@@ -127,6 +129,29 @@ describe("buildHelmUpgradeArgs", () => {
     expect(args.slice(0, 4)).toEqual(["upgrade", "--install", "kars", "/chart"]);
   });
 
+  it("does NOT take ownership of foreign-managed fields by default (no --force-conflicts)", () => {
+    const args = buildHelmUpgradeArgs(ctx, "/chart", "v0.1.20");
+    expect(args).not.toContain("--force-conflicts");
+  });
+
+  it("adds --force-conflicts only when explicitly opted in", () => {
+    const args = buildHelmUpgradeArgs(ctx, "/chart", "v0.1.20", { forceConflicts: true });
+    expect(args).toContain("--force-conflicts");
+    // still a real, atomic upgrade
+    expect(args).toContain("--atomic");
+  });
+
+  it("builds a non-mutating server dry-run for the conflict pre-flight", () => {
+    const args = buildHelmUpgradeArgs(ctx, "/chart", "v0.1.20", { dryRunServer: true });
+    expect(args).toContain("--dry-run=server");
+    // dry-run must NOT carry --atomic/--wait (meaningless) nor force ownership
+    expect(args).not.toContain("--atomic");
+    expect(args).not.toContain("--wait");
+    expect(args).not.toContain("--force-conflicts");
+    // ...but still carries the real image values so the dry-run is representative
+    expect(args.join(" ")).toContain("controller.image.tag=v0.1.20");
+  });
+
   it("only sets the Foundry endpoint when one is configured", () => {
     const withFoundry = buildHelmUpgradeArgs({ ...ctx, foundryEndpoint: "https://x.services.ai.azure.com" }, "/chart", "v0.1.20").join(" ");
     expect(withFoundry).toContain("inferenceRouter.azure.openai.endpoint=https://x.services.ai.azure.com");
@@ -135,6 +160,78 @@ describe("buildHelmUpgradeArgs", () => {
   });
 });
 
+describe("parseHelmFieldConflicts — failsafe field-manager conflict detection", () => {
+  // The real Helm v4 error that wedged the v0.1.18 → v0.1.23 upgrade on launch
+  // day, as printed to stderr (`Error: UPGRADE FAILED:` — quotes are bare here).
+  const realError =
+    "Error: UPGRADE FAILED: an error occurred while rolling back the release. " +
+    "original upgrade error: conflict occurred while applying object " +
+    "kars-system/kars-controller apps/v1, Kind=Deployment: Apply failed with 1 " +
+    'conflict: conflict with "kubectl-set" using apps/v1: ' +
+    '.spec.template.spec.containers[name="controller"].env[name="HERMES_RUNTIME_IMAGE"].value';
+
+  it("extracts the object, kind, manager, and field from a real conflict", () => {
+    const conflicts = parseHelmFieldConflicts(realError);
+    expect(conflicts).toHaveLength(1);
+    expect(conflicts[0]).toMatchObject({
+      object: "kars-system/kars-controller",
+      kind: "Deployment",
+      manager: "kubectl-set",
+    });
+    expect(conflicts[0].field).toContain('env[name="HERMES_RUNTIME_IMAGE"].value');
+  });
+
+  it("also parses Go-%q escaped quotes from the structured WARN log line", () => {
+    const warn =
+      'level=WARN msg="upgrade failed" name=kars error="conflict occurred while ' +
+      'applying object kars-system/kars-controller apps/v1, Kind=Deployment: Apply ' +
+      'failed with 1 conflict: conflict with \\"kubectl-set\\" using apps/v1: ' +
+      '.spec.template.spec.containers[name=\\"controller\\"].env[name=\\"HERMES_RUNTIME_IMAGE\\"].value"';
+    const conflicts = parseHelmFieldConflicts(warn);
+    expect(conflicts).toHaveLength(1);
+    expect(conflicts[0].manager).toBe("kubectl-set");
+  });
+
+  it("attributes multiple conflicting fields to the right object and de-dupes", () => {
+    const multi =
+      "conflict occurred while applying object ns/dep apps/v1, Kind=Deployment: " +
+      'Apply failed with 2 conflicts: conflict with "mgr-a" using apps/v1: .spec.foo, ' +
+      'conflict with "mgr-b" using apps/v1: .spec.bar; ' +
+      // a repeat of the same conflict must not double-count
+      'conflict with "mgr-a" using apps/v1: .spec.foo';
+    const conflicts = parseHelmFieldConflicts(multi);
+    expect(conflicts).toHaveLength(2);
+    expect(conflicts.map((c) => c.manager).sort()).toEqual(["mgr-a", "mgr-b"]);
+    expect(conflicts.every((c) => c.object === "ns/dep")).toBe(true);
+  });
+
+  it("returns nothing for output that has no conflict (a clean dry-run)", () => {
+    expect(parseHelmFieldConflicts("")).toEqual([]);
+    expect(parseHelmFieldConflicts("Release \"kars\" has been upgraded. Happy Helming!")).toEqual([]);
+  });
+});
+
+describe("suggestConflictRemediation — actionable copy-paste fix", () => {
+  it("turns an env-var conflict into the exact `kubectl set env … NAME-` command", () => {
+    const fix = suggestConflictRemediation({
+      object: "kars-system/kars-controller",
+      kind: "Deployment",
+      manager: "kubectl-set",
+      field: '.spec.template.spec.containers[name="controller"].env[name="HERMES_RUNTIME_IMAGE"].value',
+    });
+    expect(fix).toBe("kubectl set env deployment/kars-controller -n kars-system HERMES_RUNTIME_IMAGE-");
+  });
+
+  it("returns null for field shapes it can't safely auto-remediate", () => {
+    expect(suggestConflictRemediation({
+      object: "ns/cm",
+      kind: "ConfigMap",
+      manager: "kubectl-edit",
+      field: ".data.someKey",
+    })).toBeNull();
+  });
+});
+
 // ── health / safety / recovery coverage ────────────────────────────
 
 type Execa = typeof import("execa").execa;
diff --git a/cli/src/commands/upgrade.ts b/cli/src/commands/upgrade.ts
index c5ecc0808..4f9fab21f 100644
--- a/cli/src/commands/upgrade.ts
+++ b/cli/src/commands/upgrade.ts
@@ -9,9 +9,16 @@
 //   1. detects current vs. target version (latest GHCR release, or --to <tag>),
 //   2. records a rollback point (Helm revision),
 //   3. imports the target release images into the user's ACR (pinned + :latest),
-//   4. `helm upgrade --atomic` (auto-rolls-back the release on failure),
-//   5. rolls the controller, router, and sandbox workloads to the new images,
-//   6. verifies health and prints what changed.
+//   4. pre-flights a server-side dry-run to detect fields owned by another field
+//      manager (e.g. a manual `kubectl set env`) — Helm v4 applies server-side,
+//      so such a field would make the real apply AND its atomic rollback fail
+//      with a conflict and wedge the release. On conflict the upgrade stops
+//      BEFORE any change and prints the offending fields + remediation options
+//      (`--force-conflicts` to take ownership, or a copy-paste command to hand
+//      the field back); it never silently overwrites a field a live operator owns,
+//   5. `helm upgrade --atomic` (auto-rolls-back the release on failure),
+//   6. rolls the controller, router, and sandbox workloads to the new images,
+//   7. verifies health and prints what changed.
 // `--dry-run` shows the plan with no writes; `--rollback` reverts to the
 // previous Helm revision.
 
@@ -129,7 +136,7 @@ export function buildHelmUpgradeArgs(
   ctx: UpgradeContext,
   helmPath: string,
   target?: string,
-  opts: { skipRuntimeImages?: boolean } = {},
+  opts: { skipRuntimeImages?: boolean; forceConflicts?: boolean; dryRunServer?: boolean } = {},
 ): string[] {
   const tag = target && target.length > 0 ? target : "latest";
   const args = [
@@ -155,7 +162,24 @@ export function buildHelmUpgradeArgs(
       args.push("--set", `${valueKey}=${ctx.acrLoginServer}/${repo}:${tag}`);
     }
   }
-  args.push("--atomic", "--wait", "--timeout", "8m");
+  // Execution mode:
+  //  - dryRunServer: a server-side apply in dry-run — mutates nothing but makes
+  //    the API server report field-manager (server-side apply) conflicts, which
+  //    is how the pre-flight detects fields another manager owns BEFORE we touch
+  //    the cluster. `--atomic`/`--wait` are meaningless here and omitted.
+  //  - otherwise: the real upgrade. `--atomic` auto-rolls-back a failed rollout.
+  //  - forceConflicts: opt-in. Makes server-side apply overwrite fields owned by
+  //    other field managers (e.g. a manual `kubectl set env`). Off by default so
+  //    we never silently clobber a field a live operator legitimately manages —
+  //    the pre-flight surfaces the conflict and the operator chooses.
+  if (opts.dryRunServer) {
+    args.push("--dry-run=server");
+  } else {
+    args.push("--atomic", "--wait", "--timeout", "8m");
+  }
+  if (opts.forceConflicts) {
+    args.push("--force-conflicts");
+  }
   if (ctx.foundryEndpoint) {
     args.push("--set", `inferenceRouter.azure.openai.endpoint=${ctx.foundryEndpoint}`);
   }
@@ -168,6 +192,153 @@ export function buildHelmUpgradeArgs(
   return args;
 }
 
+/** A single server-side-apply field-manager conflict, parsed from Helm's error.
+ *  Helm v4 applies releases server-side, so a field another manager owns (e.g. a
+ *  manual `kubectl set env`) makes the apply fail with a structured conflict
+ *  instead of silently overwriting it. */
+export interface FieldManagerConflict {
+  /** `namespace/name` of the object, e.g. `kars-system/kars-controller`. */
+  object: string;
+  /** Kind of the object, e.g. `Deployment`. */
+  kind: string;
+  /** The field manager that owns the field, e.g. `kubectl-set`. */
+  manager: string;
+  /** The conflicting field path, e.g.
+   *  `.spec.template.spec.containers[name="controller"].env[name="HERMES_RUNTIME_IMAGE"].value`. */
+  field: string;
+}
+
+/** Parse server-side-apply field-manager conflicts out of a failed Helm
+ *  upgrade's stderr/stdout. Returns one entry per conflicting field. Exported
+ *  for tests. The shape we match (Helm v4 / Kubernetes SSA):
+ *
+ *    conflict occurred while applying object kars-system/kars-controller apps/v1,
+ *    Kind=Deployment: Apply failed with 1 conflict: conflict with "kubectl-set"
+ *    using apps/v1: .spec.template.spec.containers[name="controller"]
+ *    .env[name="HERMES_RUNTIME_IMAGE"].value
+ */
+export function parseHelmFieldConflicts(output: string): FieldManagerConflict[] {
+  if (!output) return [];
+  const conflicts: FieldManagerConflict[] = [];
+  const seen = new Set<string>();
+  // Each "applying object <ns/name> <group/version>, Kind=<Kind>:" header is
+  // followed by one or more `conflict with "<manager>" using <gv>: <field>`.
+  // Conflicts can repeat for the same object, so track the most recent header.
+  const objectRe = /applying object (\S+?) \S+, Kind=(\w+)/g;
+  // Manager quotes may be bare (`"x"`) or Go-%q escaped (`\"x\"`) depending on
+  // whether the line came from Helm's plain `Error:` output or its structured
+  // WARN log. Field is the last token; stop at whitespace or a `,`/`;` list
+  // separator so a multi-conflict list doesn't fold the separator into the path.
+  const conflictRe = /conflict with \\?"([^"\\]+)\\?" using [^:]+:\s*([^\s,;]+)/g;
+
+  // Walk the string, attributing each conflict to the nearest preceding object.
+  interface Header { index: number; object: string; kind: string }
+  const headers: Header[] = [];
+  for (let m = objectRe.exec(output); m; m = objectRe.exec(output)) {
+    headers.push({ index: m.index, object: m[1], kind: m[2] });
+  }
+  for (let m = conflictRe.exec(output); m; m = conflictRe.exec(output)) {
+    let header: Header | undefined;
+    for (const h of headers) {
+      if (h.index <= m.index) header = h;
+      else break;
+    }
+    const c: FieldManagerConflict = {
+      object: header?.object ?? "unknown",
+      kind: header?.kind ?? "unknown",
+      manager: m[1],
+      field: m[2],
+    };
+    const key = `${c.object}|${c.manager}|${c.field}`;
+    if (!seen.has(key)) {
+      seen.add(key);
+      conflicts.push(c);
+    }
+  }
+  return conflicts;
+}
+
+/** Suggest a precise, copy-pasteable command to hand a conflicting field back so
+ *  the upgrade can own it — when the field is a recognisable shape (a container
+ *  env var set by a manual `kubectl set env`). Returns null for field shapes we
+ *  can't safely auto-remediate; the caller then falls back to generic guidance. */
+export function suggestConflictRemediation(c: FieldManagerConflict): string | null {
+  // .spec.template.spec.containers[name="X"].env[name="ENV"].value  → kubectl set env … ENV-
+  const envMatch = c.field.match(/\.env\[name="([^"]+)"\]/);
+  if (envMatch && /Deployment/i.test(c.kind)) {
+    const [ns, name] = c.object.includes("/") ? c.object.split("/") : ["<namespace>", c.object];
+    return `kubectl set env deployment/${name} -n ${ns} ${envMatch[1]}-`;
+  }
+  return null;
+}
+
+/** Print the conflict report + the two remediation paths the operator can take.
+ *  Pure presentation; never mutates anything. */
+function reportFieldManagerConflicts(conflicts: FieldManagerConflict[]): void {
+  console.error(chalk.yellow(
+    `\n  ⚠ Pre-flight: ${conflicts.length} field(s) on this cluster are owned by another\n` +
+    `  field manager, not Helm. Server-side apply will NOT overwrite them, so the\n` +
+    `  upgrade was stopped before any change — exactly so it can't half-apply.\n`,
+  ));
+  for (const c of conflicts) {
+    console.error(chalk.yellow(`  • ${c.kind} ${c.object}`));
+    console.error(chalk.dim(`      field:   ${c.field}`));
+    console.error(chalk.dim(`      manager: ${c.manager}`));
+    const fix = suggestConflictRemediation(c);
+    if (fix) console.error(chalk.dim(`      release: ${fix}`));
+  }
+  const fixes = conflicts.map(suggestConflictRemediation).filter((f): f is string => f !== null);
+  console.error(chalk.cyan("\n  Choose one:\n"));
+  console.error(
+    "  A) Let kars take ownership (use only if no other operator manages these\n" +
+    "     fields — it overwrites them with the release's values):\n" +
+    chalk.bold("        kars upgrade --force-conflicts\n"),
+  );
+  if (fixes.length > 0) {
+    console.error(
+      "  B) Hand the field(s) back first (keeps the rest of that manager's\n" +
+      "     ownership intact), then re-run the upgrade:\n" +
+      fixes.map((f) => chalk.bold(`        ${f}`)).join("\n") +
+      chalk.bold(`\n        kars upgrade\n`),
+    );
+  } else {
+    console.error(chalk.dim(
+      "  B) Or remove the foreign field manager's ownership of those fields by\n" +
+      "     hand (e.g. revert the manual `kubectl` edit that set them), then\n" +
+      "     re-run `kars upgrade`.\n",
+    ));
+  }
+}
+
+/** Server-side dry-run pre-flight: detect field-manager conflicts WITHOUT
+ *  changing anything. Returns the parsed conflicts (empty if the dry-run
+ *  succeeds). A dry-run that fails for any *other* reason is reported as
+ *  `otherError` and is non-fatal: the real upgrade runs next and surfaces the
+ *  authoritative error rather than blocking on a flaky pre-flight. */
+async function preflightFieldManagerConflicts(
+  execa: Execa,
+  ctx: UpgradeContext,
+  helmPath: string,
+  target: string,
+  opts: { skipRuntimeImages?: boolean },
+): Promise<{ conflicts: FieldManagerConflict[]; otherError?: string }> {
+  const args = buildHelmUpgradeArgs(ctx, helmPath, target, {
+    skipRuntimeImages: opts.skipRuntimeImages,
+    dryRunServer: true,
+  });
+  try {
+    await execa("helm", args, { stdio: "pipe" });
+    return { conflicts: [] };
+  } catch (err) {
+    const e = err as { stderr?: string; stdout?: string; message?: string };
+    const blob = `${e.stderr ?? ""}\n${e.stdout ?? ""}\n${e.message ?? ""}`;
+    const conflicts = parseHelmFieldConflicts(blob);
+    if (conflicts.length > 0) return { conflicts };
+    return { conflicts: [], otherError: e.message ?? String(err) };
+  }
+}
+
+
 export function upgradeCommand(): Command {
   const cmd = new Command("upgrade");
   cmd
@@ -180,6 +351,12 @@ export function upgradeCommand(): Command {
     .option("--rollback", "Roll the cluster back to the previous Helm revision.", false)
     .option("--skip-runtime-images", "Skip the 7 multi-runtime adapter images (faster).", false)
     .option("--force", "Re-run the upgrade even if already at the target version.", false)
+    .option(
+      "--force-conflicts",
+      "Take ownership of fields managed by another field manager (e.g. a manual " +
+      "`kubectl set env`). Only safe when no other operator manages those fields.",
+      false,
+    )
     .option("--yes", "Skip the confirmation prompt (assume yes).", false)
     .addHelpText("after", `
 Examples:
@@ -187,6 +364,7 @@ Examples:
   kars upgrade --to v0.1.16        # Pin a specific release
   kars upgrade --dry-run           # Show changelog + impact + plan, make no changes
   kars upgrade --yes               # Skip the confirmation prompt
+  kars upgrade --force-conflicts   # Take ownership of fields another manager owns
   kars upgrade --rollback          # Revert to the previous Helm revision
 `)
     .action(async (options) => {
@@ -374,8 +552,34 @@ Examples:
         // ── Step 4: Atomic Helm upgrade ───────────────────────────────
         stepper.step("Upgrading controller + CRDs (atomic Helm upgrade)...");
         const helmPath = requireBundledAsset("deploy/helm/kars");
+
+        // Pre-flight: a server-side dry-run detects fields owned by another
+        // field manager (e.g. a manual `kubectl set env`). Helm v4 applies
+        // server-side, so such a field makes the real apply — AND its atomic
+        // rollback — fail with a conflict, leaving the release wedged. Catch it
+        // here, before any mutation, and let the operator decide rather than
+        // silently overwriting a field a live operator may legitimately own.
+        const pf = await preflightFieldManagerConflicts(execa, ctx, helmPath, target, {
+          skipRuntimeImages: options.skipRuntimeImages,
+        });
+        if (pf.conflicts.length > 0 && !options.forceConflicts) {
+          stepper.fail("Field-manager conflict — upgrade stopped before any change");
+          reportFieldManagerConflicts(pf.conflicts);
+          process.exit(1);
+        }
+        if (pf.conflicts.length > 0 && options.forceConflicts) {
+          stepper.detail("info",
+            `Taking ownership of ${pf.conflicts.length} field(s) from another manager (--force-conflicts)`);
+        }
+        if (pf.otherError) {
+          // Non-conflict dry-run failure: don't block — the real upgrade below
+          // surfaces the authoritative error (and its --atomic rolls back).
+          stepper.detail("info", "Pre-flight dry-run inconclusive — proceeding (real upgrade will report any error)");
+        }
+
         await execa("helm", buildHelmUpgradeArgs(ctx, helmPath, target, {
           skipRuntimeImages: options.skipRuntimeImages,
+          forceConflicts: options.forceConflicts,
         }), { stdio: "pipe" });
         stepper.done("Helm upgrade applied (auto-rollback on failure via --atomic)");
 
@@ -440,7 +644,19 @@ Examples:
         process.exit(0);
       } catch (err) {
         stepper.stop();
+        const e = err as { stderr?: string; stdout?: string; message?: string };
         const msg = err instanceof Error ? err.message : String(err);
+        // If a field-manager conflict slipped past the pre-flight (e.g. a field
+        // that only appears once the apply runs), give the same actionable
+        // guidance instead of a raw Helm error.
+        const lateConflicts = parseHelmFieldConflicts(
+          `${e.stderr ?? ""}\n${e.stdout ?? ""}\n${msg}`,
+        );
+        if (lateConflicts.length > 0 && !options.forceConflicts) {
+          console.error(chalk.red(`\n  Upgrade failed: field-manager conflict\n`));
+          reportFieldManagerConflicts(lateConflicts);
+          process.exit(1);
+        }
         console.error(chalk.red(`\n  Upgrade failed: ${msg}\n`));
         console.error(chalk.yellow(
           "  The atomic Helm upgrade auto-rolls-back the release on failure. If workloads\n" +

From 8789c2451de116b3142c08dd830a79b74019f5f6 Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:28:54 +0200
Subject: [PATCH 06/10] docs(mcp): MCP guide + runnable Playwright MCP example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add docs/mcp.md — how to add an MCP server (McpServer CR + mcpServerRefs), how
tool calls are governed through the per-pod router, out-of-the-box egress
auto-derivation, and the session keepalive that keeps stateful flows (e.g.
browser automation) on one live page instead of resetting to about:blank.

Add examples/playwright-mcp/ — a browser-automation OpenClaw agent on the
official Playwright MCP, end to end: in-cluster MCP Deployment/Service, the
McpServer CR, and a KarsSandbox whose only MCP-specific line is one
mcpServerRefs entry (no hand-written egress). Wire both into the docs nav,
examples catalogue, and CLI reference (kars update).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 docs/README.md                                |   1 +
 docs/SUMMARY.md                               |   1 +
 docs/cli-reference.md                         |  38 ++++
 docs/examples.md                              |   1 +
 docs/mcp.md                                   | 191 ++++++++++++++++++
 examples/README.md                            |   1 +
 .../playwright-mcp/00-playwright-mcp.yaml     |  85 ++++++++
 examples/playwright-mcp/01-mcpserver.yaml     |  53 +++++
 examples/playwright-mcp/02-karssandbox.yaml   | 104 ++++++++++
 examples/playwright-mcp/README.md             | 124 ++++++++++++
 10 files changed, 599 insertions(+)
 create mode 100644 docs/mcp.md
 create mode 100644 examples/playwright-mcp/00-playwright-mcp.yaml
 create mode 100644 examples/playwright-mcp/01-mcpserver.yaml
 create mode 100644 examples/playwright-mcp/02-karssandbox.yaml
 create mode 100644 examples/playwright-mcp/README.md

diff --git a/docs/README.md b/docs/README.md
index ba3e9fe07..d8c01086a 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -97,6 +97,7 @@ This section mirrors the chapter groups in **[`SUMMARY.md`](SUMMARY.md)**, which
 - [kars OpenClaw plugin](openclaw-plugin.md) — the in-sandbox plugin (24 governance-aware tools, 10 skills) every kars-managed agent loads.
 - [`@kars/mesh` plugin](mesh-plugin.md) — the companion local plugin (built from source, not yet published on npm) for pairing a local OpenClaw with a remote kars cluster.
 - [Channels & external plugins](channels-plugins.md) — Telegram / Slack / Discord / WhatsApp channels + 3rd-party search/scrape API integrations via CLI flags.
+- [MCP servers](mcp.md) — add an MCP server (`McpServer` CR + `mcpServerRefs`), how tool calls are governed, out-of-the-box egress + session keepalive.
 - [Operator TUI](operator-tui.md) — `kars operator`, the live cluster dashboard.
 - [Permissions model](permissions.md) — the Azure RBAC `kars up` needs, enumerated.
 - [Per-sandbox identity](agent-identity.md) — each sandbox runs under its own Entra Agent ID.
diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md
index 79fa56256..15c6f8ee9 100644
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -44,6 +44,7 @@
 - [kars Hermes plugin](hermes-plugin.md)
 - [`@kars/mesh` plugin (local OpenClaw)](mesh-plugin.md)
 - [Channels & external plugins](channels-plugins.md)
+- [MCP servers](mcp.md)
 - [Operator TUI](operator-tui.md)
 - [Permissions model](permissions.md)
 - [Per-sandbox identity (Entra Agent ID)](agent-identity.md)
diff --git a/docs/cli-reference.md b/docs/cli-reference.md
index f35a59eec..3908642f1 100644
--- a/docs/cli-reference.md
+++ b/docs/cli-reference.md
@@ -85,6 +85,10 @@ All commands also inherit Commander.js built-in `--help`.
 - [mcp](#kars-mcp)
 - [memory](#kars-memory)
 
+### Self-management
+
+- [update](#kars-update)
+
 ---
 
 ## Lifecycle
@@ -236,6 +240,40 @@ tagging model.
 
 ---
 
+### `kars update`
+
+Keep the **CLI itself** current. Every `kars` run also does a quiet, cached
+check (at most once per day) and prints a one-line notice when a newer
+`@kars-runtime/cli` has been published; `kars update` is the explicit,
+always-fresh version that shows the changelog and offers to install.
+
+> `kars upgrade` upgrades the **cluster** (controller, router, sandboxes).
+> `kars update` upgrades the **CLI npm package** on your machine. Different
+> things.
+
+```
+kars update [options]
+```
+
+| Option | Description |
+|---|---|
+| `--check` | Only check and report; never prompt or install. Exits non-zero when an update is available (handy in scripts). |
+| `--yes` | Install the latest version without prompting. |
+
+```bash
+kars update            # check, show changelog, offer to install
+kars update --check    # report only (CI-friendly)
+kars update --yes      # non-interactive install
+```
+
+The install runs `npm install -g @kars-runtime/cli@latest`. Disable the
+automatic background notice with `KARS_NO_UPDATE_CHECK=1` (also off in CI and
+when stdout isn't a TTY).
+
+**See also:** [`kars upgrade`](#kars-upgrade) (cluster upgrade)
+
+---
+
 ### `kars dev`
 
 Runs a fully-policy-enforced sandbox locally for inner-loop development —
diff --git a/docs/examples.md b/docs/examples.md
index 48ac09821..615d84b96 100644
--- a/docs/examples.md
+++ b/docs/examples.md
@@ -12,6 +12,7 @@ Eight end-to-end examples live under [`examples/`](https://github.com/Azure/kars
 | [`openai-agents-quickstart`](https://github.com/Azure/kars/tree/main/examples/openai-agents-quickstart) | `OpenAIAgents` (Python) | YAMLs apply once you swap `REPLACE-ME/...` with your image | Hosts an unmodified [OpenAI Agents SDK](https://github.com/openai/openai-agents-python) app. The adapter transparently routes `api.openai.com` through the local inference router. |
 | [`maf-quickstart`](https://github.com/Azure/kars/tree/main/examples/maf-quickstart) | `MicrosoftAgentFramework` (Python) | YAMLs apply once you swap `REPLACE-ME/...` with your image | Hosts an unmodified [Microsoft Agent Framework](https://github.com/microsoft/agent-framework) app. |
 | [`byo-quickstart`](https://github.com/Azure/kars/tree/main/examples/byo-quickstart) | `BYO` | Builds + applies cleanly; runtime requires you to bring an image | Brings any container image under the BYO contract. Includes a tiny FastAPI reference agent. |
+| [`playwright-mcp`](https://github.com/Azure/kars/tree/main/examples/playwright-mcp) | `OpenClaw` + MCP | ✅ Verified live on AKS — browser flow (navigate → click → snapshot → evaluate) stays on one page; controller auto-derives the MCP egress rule | Browser-automation agent on the official [Playwright MCP](https://github.com/microsoft/playwright-mcp). Shows zero-config MCP wiring (one `mcpServerRefs` entry), out-of-the-box egress auto-derivation, and reliable multi-step browsing via router session keepalive. |
 
 ## Multi-agent attack-simulation demos
 
diff --git a/docs/mcp.md b/docs/mcp.md
new file mode 100644
index 000000000..d8717160e
--- /dev/null
+++ b/docs/mcp.md
@@ -0,0 +1,191 @@
+# MCP servers in kars
+
+The [Model Context Protocol](https://modelcontextprotocol.io) (MCP) is how a
+kars agent reaches tools it doesn't ship with — a hosted search API, a wiki
+reader, a headless browser, your internal services. kars treats every MCP
+server as an **untrusted upstream**: the agent never holds its credentials and
+never opens a socket to it directly. Everything goes through the per-pod
+**inference router**, which discovers the MCP's tools, enforces governance on
+every call, and is the only network path to the server.
+
+This guide covers adding an MCP, how it's reached, authentication, and the two
+mechanisms that make MCP support **work out of the box**: egress
+auto-derivation and session keepalive.
+
+> Looking for a runnable example? See
+> [`examples/playwright-mcp/`](../examples/playwright-mcp/) — a browser agent on
+> the official Playwright MCP, end to end.
+
+## The model: `McpServer` CR + `mcpServerRefs`
+
+Two pieces, both declarative:
+
+1. An **`McpServer`** custom resource describes the server: its URL, the tools
+   you allow, which sandboxes may use it, and (for hosted servers) its OAuth
+   config.
+2. A sandbox **opts in** by naming that CR in
+   `spec.governance.mcpServerRefs`.
+
+```yaml
+apiVersion: kars.azure.com/v1alpha1
+kind: McpServer
+metadata:
+  name: playwright
+  namespace: kars-system            # same namespace as the sandbox(es)
+spec:
+  url: "http://playwright-mcp.kars-mcp.svc.cluster.local:8931/mcp"
+  allowedTools: [browser_navigate, browser_click, browser_snapshot, browser_evaluate]
+  allowedSandboxes:
+    matchLabels: { kars.azure.com/sandbox: browser }
+  productionMode: false
+  displayName: "Playwright (headless Chromium)"
+---
+apiVersion: kars.azure.com/v1alpha1
+kind: KarsSandbox
+metadata:
+  name: browser
+  namespace: kars-system
+  labels: { kars.azure.com/sandbox: browser }
+spec:
+  runtime: { kind: OpenClaw, openclaw: {} }
+  governance:
+    enabled: true
+    mcpServerRefs:
+      - name: playwright           # ← the only MCP-specific line
+  networkPolicy:
+    defaultDeny: true
+    egressMode: Strict
+```
+
+You can also create the CR imperatively:
+
+```bash
+kars mcp apply playwright \
+  --namespace kars-system \
+  --url http://playwright-mcp.kars-mcp.svc.cluster.local:8931/mcp \
+  --allowed-tool browser_navigate --allowed-tool browser_click \
+  --allowed-sandbox-label kars.azure.com/sandbox=browser \
+  --display-name "Playwright (headless Chromium)"
+
+kars mcp list -n kars-system
+kars mcp get playwright -n kars-system -o yaml
+kars mcp delete playwright -n kars-system
+```
+
+### Key `McpServer.spec` fields
+
+| Field | Meaning |
+|---|---|
+| `url` | Streamable-HTTP MCP endpoint. In-cluster Service DNS or a hosted `https://` URL. |
+| `allowedTools` | Allow-list of tool names. Empty = none; `["*"]` = all (then gate with `ToolPolicy`). Pin explicitly so an upstream change can't widen the surface. |
+| `allowedSandboxes.matchLabels` | Which sandboxes may use this MCP. Empty = same-namespace only. |
+| `productionMode` | `true` requires HTTPS + OAuth 2.1. |
+| `oauth` | OAuth issuer/audience/resource for `productionMode`. The router mints tokens; the agent never sees them. |
+| `bearerFromEnv` | Static outbound bearer from a named env var, for MCPs that use a long-lived API token. |
+| `crossNamespaceAllowed` | Allow sandboxes in other namespaces to reference this CR. |
+
+The full schema is in the [CRD reference](api/crd-reference.md).
+
+## How a tool call flows
+
+```
+agent (UID 1000) ──127.0.0.1:8443──▶ inference-router (UID 1001) ──▶ MCP server
+        tools/call "playwright.browser_navigate"      │
+                                                        ├─ trust + ToolPolicy check
+                                                        ├─ audit event
+                                                        ├─ token budget
+                                                        └─ outbound auth (OAuth / bearer)
+```
+
+The agent calls a **namespaced** tool (`<server>.<tool>`, e.g.
+`playwright.browser_navigate`) on loopback. The router authorises it, dispatches
+to the MCP, and returns the result. The agent has no ambient network reach and
+no credentials.
+
+## Out-of-the-box egress
+
+Because the router is the only path to the MCP, the sandbox's default-deny
+NetworkPolicy has to admit the **router → MCP** hop. kars does this for you:
+the controller parses the `McpServer.spec.url` and emits the right egress rule
+automatically — you do **not** add the MCP host to
+`networkPolicy.allowedEndpoints` by hand.
+
+- **In-cluster Service** (`*.svc.cluster.local`): the controller emits a
+  `namespaceSelector` rule for the MCP's namespace. This matters under the
+  Cilium CNI, where a K8s NetworkPolicy `ipBlock` (even `0.0.0.0/0`) only
+  matches the reserved `world` entity and never an in-cluster pod — so an
+  `ipBlock` rule would silently fail to admit traffic to another pod.
+- **External host, non-443**: a coarse port-level rule; the router's CONNECT
+  allowlist enforces the exact host.
+- **External host, 443**: already covered by the router's blanket HTTPS path —
+  no extra rule needed.
+
+Verify it after applying a sandbox:
+
+```bash
+kubectl -n kars-<sandbox> get networkpolicy -o yaml | grep -A4 namespaceSelector
+```
+
+## Reliable sessions (no `about:blank` mid-task)
+
+Stateful MCP servers — Playwright is one — keep per-session state (your live
+browser page) and run a **server-side heartbeat**: they send the client a
+JSON-RPC `ping` every few seconds and destroy the session if no `pong` comes
+back within a short window (Playwright's default is 5s). A naive request/response
+client never answers those pings, so the server reaps the session; the next
+tool call gets `404 Session not found`, the client re-initialises, and the work
+lands on a **fresh, blank** page — the agent sees `about:blank` mid-task.
+
+kars's router is a **well-formed MCP client**: for every stateful session it
+holds the standalone SSE stream open and answers the server's `ping`s with
+`pong`s, keeping the session — and the agent's live page — alive. Multi-step
+flows (`navigate → click → snapshot → evaluate`) therefore stay on one page.
+This is automatic for any heartbeating MCP; there's nothing to configure.
+
+## Authentication
+
+The agent never holds MCP credentials. Two outbound modes, both handled by the
+router:
+
+- **OAuth 2.1** (`productionMode: true` + `oauth:`): the controller wires JWKS
+  rotation and the router presents a signed bearer token to the MCP.
+
+  ```yaml
+  spec:
+    url: "https://mcp.example.com/mcp"
+    productionMode: true
+    oauth:
+      issuer: "https://login.microsoftonline.com/<tenant>/v2.0"
+      audience: "api://your-mcp"
+  ```
+
+- **Static bearer** (`bearerFromEnv`): for MCPs that authenticate with a
+  long-lived API token, stored in the sandbox's `<name>-credentials` secret and
+  injected by name. The token stays in the router; the agent only sees tools.
+
+## Tool governance
+
+`allowedTools` on the `McpServer` is the coarse gate. For per-tool rules
+(arguments, rate limits, approval), bind a [`ToolPolicy`](api/crd-reference.md)
+via `governance.toolPolicyRef`. MCP tools are subject to the same AGT governance
+as built-in tools — trust scoring, audit, and policy all apply.
+
+For the MCP-specific threat model (tool poisoning, confused-deputy, prompt
+injection through tool output), see the
+[MCP security top-10](security-mcp-top10.md).
+
+## Troubleshooting
+
+| Symptom | Cause | Fix |
+|---|---|---|
+| Agent says the tool doesn't exist | Tool not in `allowedTools`, or sandbox label doesn't match `allowedSandboxes` | Add the tool / fix the label; re-apply the CR. |
+| `404 Session not found`, page resets to `about:blank` | Router not keeping the session alive (pre-0.1.24) | Upgrade the router; keepalive is automatic. |
+| Calls time out to an in-cluster MCP | Egress not admitted (e.g. `ipBlock` under Cilium) | Use the MCP's Service DNS `url` so the controller derives a `namespaceSelector` rule; check `kubectl -n kars-<sandbox> get networkpolicy`. |
+| `403`/`401` from a hosted MCP | OAuth/bearer misconfigured | Check `oauth.issuer`/`audience` or the `bearerFromEnv` secret. |
+
+## See also
+
+- [`examples/playwright-mcp/`](../examples/playwright-mcp/) — runnable end-to-end example.
+- [CRD reference](api/crd-reference.md) — full `McpServer` / `KarsSandbox` schema.
+- [MCP security top-10](security-mcp-top10.md) — the MCP threat model.
+- [Architecture diagrams](architecture-diagrams.md) — the MCP data path.
diff --git a/examples/README.md b/examples/README.md
index b4f7a2648..c46cb23fd 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -12,6 +12,7 @@ End-to-end blueprints you can `kubectl apply -f` after running `kars up`.
 | [`openai-agents-quickstart`](openai-agents-quickstart/) | OpenAIAgents (Python) | Hosts an unmodified OpenAI Agents SDK app inside a kars sandbox. Same security as default. |
 | [`maf-quickstart`](maf-quickstart/) | MicrosoftAgentFramework (Python) | Hosts an unmodified Microsoft Agent Framework app inside a kars sandbox. |
 | [`byo-quickstart`](byo-quickstart/) | BYO | Brings any container image under the BYO contract (`spec.runtime.kind: BYO`). Same isolation, same router. |
+| [`playwright-mcp`](playwright-mcp/) | OpenClaw + MCP | Browser-automation agent on the official Playwright MCP. Shows zero-config MCP wiring, **out-of-the-box egress auto-derivation**, and reliable multi-step browsing (session keepalive). |
 
 All examples share the same control-plane install and isolation guarantees
 — only the agent runtime image changes. See
diff --git a/examples/playwright-mcp/00-playwright-mcp.yaml b/examples/playwright-mcp/00-playwright-mcp.yaml
new file mode 100644
index 000000000..b55df984b
--- /dev/null
+++ b/examples/playwright-mcp/00-playwright-mcp.yaml
@@ -0,0 +1,85 @@
+# 00-playwright-mcp.yaml — the Playwright MCP server, in-cluster.
+#
+# This is an ordinary Deployment + Service running Microsoft's official
+# Playwright MCP image (a real Chromium driven over the Model Context
+# Protocol's Streamable-HTTP transport). It is NOT a kars component — it's
+# exactly the kind of third-party MCP server you'd bring yourself. kars treats
+# it as an untrusted upstream the agent reaches only through its per-pod
+# inference router.
+#
+# It lives in its own namespace (`kars-mcp`) so the example can show kars's
+# out-of-the-box egress: the controller parses the McpServer URL
+# (01-mcpserver.yaml) and auto-derives a NetworkPolicy rule admitting the
+# sandbox router to THIS namespace on :8931 — no hand-written egress needed.
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: kars-mcp
+  labels:
+    # Auto-added by Kubernetes (>=1.21); the controller's derived egress rule
+    # selects this namespace by this well-known label, which is the only form
+    # Cilium honours for in-cluster pod destinations.
+    kubernetes.io/metadata.name: kars-mcp
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: playwright-mcp
+  namespace: kars-mcp
+  labels:
+    app: playwright-mcp
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: playwright-mcp
+  template:
+    metadata:
+      labels:
+        app: playwright-mcp
+    spec:
+      containers:
+        - name: mcp
+          image: mcr.microsoft.com/playwright/mcp:latest
+          imagePullPolicy: IfNotPresent
+          args:
+            - --port=8931
+            - --host=0.0.0.0
+            - --headless
+            - --browser=chromium
+            - --no-sandbox
+            # Playwright MCP rejects requests whose Host header isn't allow-listed.
+            # The router connects via the Service's cluster DNS name, so list it
+            # (and the short forms) here.
+            - --allowed-hosts=playwright-mcp.kars-mcp.svc.cluster.local:8931,playwright-mcp.kars-mcp:8931,playwright-mcp:8931,localhost:8931,localhost
+          ports:
+            - containerPort: 8931
+              name: mcp
+          readinessProbe:
+            tcpSocket:
+              port: 8931
+            initialDelaySeconds: 3
+            periodSeconds: 5
+          resources:
+            requests:
+              cpu: "250m"
+              memory: "512Mi"
+            limits:
+              cpu: "2"
+              memory: "2Gi"
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: playwright-mcp
+  namespace: kars-mcp
+  labels:
+    app: playwright-mcp
+spec:
+  selector:
+    app: playwright-mcp
+  ports:
+    - name: mcp
+      port: 8931
+      targetPort: 8931
diff --git a/examples/playwright-mcp/01-mcpserver.yaml b/examples/playwright-mcp/01-mcpserver.yaml
new file mode 100644
index 000000000..dc4123553
--- /dev/null
+++ b/examples/playwright-mcp/01-mcpserver.yaml
@@ -0,0 +1,53 @@
+# 01-mcpserver.yaml — register the Playwright MCP server with kars.
+#
+# The McpServer CR is the declarative "this MCP exists and these tools are
+# allowed" record. The controller mirrors it into any sandbox that references
+# it (02-karssandbox.yaml) and the per-pod inference router becomes the ONLY
+# path to it: the agent process calls tools on 127.0.0.1, the router dispatches
+# to this URL, and governance (trust, policy, audit, token budget) runs on the
+# way through. The agent never gets the raw network path or any credentials.
+#
+# Out-of-the-box egress: because `url` resolves to an in-cluster Service, the
+# controller derives the sandbox NetworkPolicy egress rule for the `kars-mcp`
+# namespace automatically. You do NOT add `playwright-mcp...:8931` to the
+# sandbox's `networkPolicy.allowedEndpoints` by hand.
+---
+apiVersion: kars.azure.com/v1alpha1
+kind: McpServer
+metadata:
+  name: playwright
+  namespace: kars-system
+  labels:
+    kars.azure.com/sandbox: browser
+spec:
+  # Streamable-HTTP MCP endpoint. In-cluster Service DNS → the controller
+  # auto-derives the matching egress rule. For a hosted MCP you'd use an
+  # https:// URL (e.g. https://mcp.example.com/mcp) and nothing else changes.
+  url: "http://playwright-mcp.kars-mcp.svc.cluster.local:8931/mcp"
+
+  # Pin the tool surface explicitly so an upstream change can't widen it
+  # without a CR edit. These are the browser-automation tools Playwright MCP
+  # advertises.
+  allowedTools:
+    - browser_navigate
+    - browser_click
+    - browser_type
+    - browser_snapshot
+    - browser_take_screenshot
+    - browser_evaluate
+    - browser_wait_for
+    - browser_press_key
+    - browser_select_option
+
+  # Which sandboxes may use this MCP. Empty = same-namespace only; here we
+  # gate on a label so only the browser sandbox(es) get it.
+  allowedSandboxes:
+    matchLabels:
+      kars.azure.com/sandbox: browser
+
+  # This MCP is unauthenticated (it's our own in-cluster Chromium). For a
+  # production/hosted MCP, set `productionMode: true` and an `oauth:` block;
+  # the controller then wires JWKS rotation + signed bearer tokens through the
+  # router and the agent still never sees a token.
+  productionMode: false
+  displayName: "Playwright (headless Chromium browser automation)"
diff --git a/examples/playwright-mcp/02-karssandbox.yaml b/examples/playwright-mcp/02-karssandbox.yaml
new file mode 100644
index 000000000..ce9539126
--- /dev/null
+++ b/examples/playwright-mcp/02-karssandbox.yaml
@@ -0,0 +1,104 @@
+# 02-karssandbox.yaml — a browser-automation agent that consumes the
+# Playwright MCP. OpenClaw runtime, default kars hardening, governance on.
+#
+# The only MCP-specific line is `governance.mcpServerRefs` — it names the
+# McpServer CR (01-mcpserver.yaml). From that single reference the controller:
+#   1. mirrors the MCP's per-server config into this sandbox's namespace and
+#      mounts it into the inference router,
+#   2. AUTO-DERIVES the NetworkPolicy egress rule to reach the in-cluster
+#      Playwright Service (namespace `kars-mcp`, port 8931) — note there is no
+#      hand-written `allowedEndpoints` entry for it below, and
+#   3. exposes the allow-listed `browser_*` tools to the agent on loopback.
+#
+# Browser automation "just works": the router holds the MCP session alive
+# (answers the server's heartbeat pings), so multi-step flows — navigate →
+# click → snapshot → evaluate — keep the SAME live page instead of resetting
+# to about:blank mid-task.
+---
+apiVersion: kars.azure.com/v1alpha1
+kind: InferencePolicy
+metadata:
+  name: browser-inference
+  namespace: kars-system
+  labels:
+    kars.azure.com/sandbox: browser
+spec:
+  appliesTo:
+    sandboxName: browser
+  modelPreference:
+    primary:
+      provider: azure-openai
+      deployment: gpt-4.1
+  contentSafety:
+    requirePromptShields: true
+  tokenBudget:
+    dailyTokens: 500000
+    perRequestTokens: 128000
+---
+apiVersion: kars.azure.com/v1alpha1
+kind: KarsSandbox
+metadata:
+  name: browser
+  namespace: kars-system
+  labels:
+    # Matches McpServer.spec.allowedSandboxes.matchLabels so this sandbox is
+    # permitted to use the Playwright MCP.
+    kars.azure.com/sandbox: browser
+spec:
+  runtime:
+    kind: OpenClaw
+    openclaw: {}
+
+  agent:
+    instructions: |-
+      You are a browser-automation agent. You have Playwright MCP tools
+      (browser_navigate, browser_click, browser_type, browser_snapshot,
+      browser_take_screenshot, browser_evaluate). Use them to drive a real
+      Chromium browser to fulfil the user's web tasks. Read the accessibility
+      snapshot to understand the page before you act.
+
+  sandbox:
+    isolation: "enhanced"
+    seccompProfile: "kars-strict"
+    readOnlyRootFilesystem: true
+    runAsNonRoot: true
+    allowPrivilegeEscalation: false
+    writablePaths:
+      - /sandbox
+      - /tmp
+
+  inferenceRef:
+    name: browser-inference
+
+  # Governance ON. `mcpServerRefs` is the consumer side of the McpServer CR.
+  governance:
+    enabled: true
+    toolPolicyRef:
+      name: kars-default
+    mcpServerRefs:
+      - name: playwright
+    # Demo trust threshold. Production should run `kars mesh setup-trust` and
+    # raise this to 500+.
+    trustThreshold: 0
+
+  # Egress is default-deny and actively enforced. We deliberately DO NOT list
+  # the Playwright Service here — the controller derives that rule from the
+  # referenced McpServer's URL. Everything the agent reaches goes through the
+  # router on loopback, so this list is for direct sandbox→external egress only
+  # (empty here: the browser runs inside the MCP server, not the agent pod).
+  networkPolicy:
+    defaultDeny: true
+    egressMode: Strict
+    allowedEndpoints: []
+
+  azureServices:
+    - service: ai-foundry
+      permissions: [inference]
+
+  resources:
+    requests:
+      cpu: "500m"
+      memory: "1Gi"
+    limits:
+      cpu: "2"
+      memory: "4Gi"
diff --git a/examples/playwright-mcp/README.md b/examples/playwright-mcp/README.md
new file mode 100644
index 000000000..20b624f88
--- /dev/null
+++ b/examples/playwright-mcp/README.md
@@ -0,0 +1,124 @@
+# kars Example: Browser-automation agent on a Playwright MCP
+
+A sandboxed OpenClaw agent that drives a **real headless Chromium** through the
+official [Playwright MCP](https://github.com/microsoft/playwright-mcp) server —
+navigating pages, clicking, typing, taking snapshots — entirely under kars
+governance and isolation.
+
+This is the canonical "**bring an MCP server and it just works**" example. The
+only MCP-specific line in the whole sandbox spec is a single
+`governance.mcpServerRefs` entry; kars does the rest.
+
+## What it shows
+
+| Capability | How |
+|---|---|
+| **Zero-config MCP wiring** | One `McpServer` CR + one `mcpServerRefs` entry. The controller mirrors config into the sandbox and exposes the allow-listed `browser_*` tools on the agent's loopback. |
+| **Out-of-the-box egress** | The controller parses the MCP's `url`, sees it's an in-cluster Service, and **auto-derives** the default-deny NetworkPolicy egress rule (`namespaceSelector` for `kars-mcp`, port 8931). You never hand-write that rule. |
+| **Reliable multi-step browsing** | The per-pod router holds the MCP session alive by answering Playwright's heartbeat pings, so `navigate → click → snapshot → evaluate` keeps the **same live page** instead of resetting to `about:blank` mid-task. |
+| **No credentials in the agent** | The agent calls tools on `127.0.0.1`; the router is the only path to the MCP. Trust, policy, audit and token budget run on every call. |
+| **Default kars hardening** | seccomp `kars-strict`, read-only rootfs, drop-ALL caps, non-root, enhanced isolation. |
+
+## Architecture
+
+```
+┌─ namespace: kars-system ───────────────────────────────────────────┐
+│  KarsSandbox "browser"  (OpenClaw, enhanced isolation)              │
+│   ├── openclaw container (UID 1000) ── tools on 127.0.0.1:8443 ──┐  │
+│   └── inference-router  (UID 1001) ─────────────────────────────┼─ │
+│         • holds MCP session alive (heartbeat pong)              │ │
+│         • governance on every tool call                         │ │
+└─────────────────────────────────────────────────────────────────┼─┘
+                  derived egress (namespaceSelector kars-mcp:8931)  │
+┌─ namespace: kars-mcp ─────────────────────────────────────────────▼┐
+│  Deployment/Service "playwright-mcp"  (mcr.microsoft.com/playwright/mcp)
+│   • real headless Chromium, Streamable-HTTP MCP on :8931           │
+└────────────────────────────────────────────────────────────────────┘
+```
+
+## Prerequisites
+
+A running kars control plane (`kars up`) on an AKS cluster, and `kubectl`
+pointed at it.
+
+## Deploy
+
+```bash
+# 1. The Playwright MCP server (in its own namespace).
+kubectl apply -f 00-playwright-mcp.yaml
+kubectl -n kars-mcp rollout status deploy/playwright-mcp
+
+# 2. Register the MCP with kars + the browser agent.
+kubectl apply -f 01-mcpserver.yaml
+kubectl apply -f 02-karssandbox.yaml
+kubectl -n kars-system wait --for=jsonpath='{.status.phase}'=Running \
+  karssandbox/browser --timeout=180s
+```
+
+## Use it
+
+```bash
+kars connect browser
+```
+
+Then ask it to do something on the web, e.g.:
+
+> Navigate to https://example.com, take a snapshot, and tell me the page heading.
+
+> Go to https://playwright.dev, click "Get started", and summarise the page.
+
+Multi-step flows work because the router keeps the browser session (and its
+live page) alive across calls.
+
+## Verify the out-of-the-box egress (optional)
+
+The controller derives the egress rule from the MCP URL — confirm it's there
+without you having written it:
+
+```bash
+kubectl -n kars-browser get networkpolicy -o yaml | grep -A4 namespaceSelector
+# → a rule selecting kubernetes.io/metadata.name: kars-mcp on TCP 8931
+```
+
+## How it works
+
+1. **`McpServer` CR** (`01-mcpserver.yaml`) declares the endpoint, the
+   allow-listed tools, and which sandboxes may use it.
+2. **`mcpServerRefs`** in the sandbox (`02-karssandbox.yaml`) opts the agent in.
+   The controller mirrors the MCP's config into the sandbox namespace, mounts
+   it into the router, and — because the URL is an in-cluster Service —
+   **auto-derives the NetworkPolicy egress** to `kars-mcp:8931`. (Cilium only
+   honours a `namespaceSelector` for in-cluster pod destinations, never an
+   `ipBlock`; the controller emits the correct form.)
+3. **The inference router** discovers the MCP's tools, namespaces them, and
+   exposes them to the agent on loopback. It **acts as a well-formed MCP
+   client**: it holds the standalone SSE stream open and answers the server's
+   heartbeat `ping`s with `pong`s, so Playwright never reaps the session. That
+   is what keeps a browsing flow on one live page.
+
+## Going hosted
+
+To point at a **hosted** MCP instead of an in-cluster one, change just the URL:
+
+```yaml
+spec:
+  url: "https://mcp.example.com/mcp"
+  productionMode: true
+  oauth:
+    issuer: "https://login.microsoftonline.com/<tenant>/v2.0"
+    audience: "api://your-mcp"
+```
+
+External `https://` hosts on 443 are already covered by the router's HTTPS
+path; the controller adds a coarse port rule for non-443 hosts and the router
+enforces the exact host. The agent still never sees a token.
+
+## Clean up
+
+```bash
+kubectl delete -f 02-karssandbox.yaml -f 01-mcpserver.yaml -f 00-playwright-mcp.yaml
+```
+
+See [`docs/mcp.md`](../../docs/mcp.md) for the full MCP guide and
+[`docs/api/crd-reference.md`](../../docs/api/crd-reference.md) for the
+`McpServer` / `KarsSandbox` schemas.

From 9d4e77053cc92f6a4779699e07e17ac92a641db6 Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:39:19 +0200
Subject: [PATCH 07/10] refactor(controller): extract reconciler::mcp_egress
 submodule

Move the MCP egress derivation (mcp_url_host_port / cluster_internal_namespace
/ mcp_egress_rule + the per-ref derive_mcp_egress_rules loop) and its unit
tests out of reconciler/mod.rs into a focused reconciler::mcp_egress module.
Keeps mod.rs under the ci/check-loc.sh phase0 budget (3700 LOC) and groups the
egress logic in one place. Pure code-move + the reconcile loop now calls
derive_mcp_egress_rules; no behaviour change. 854 controller tests pass.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 controller/src/reconciler/mcp_egress.rs | 275 +++++++++++++++++++++
 controller/src/reconciler/mod.rs        | 305 ++----------------------
 2 files changed, 299 insertions(+), 281 deletions(-)
 create mode 100644 controller/src/reconciler/mcp_egress.rs

diff --git a/controller/src/reconciler/mcp_egress.rs b/controller/src/reconciler/mcp_egress.rs
new file mode 100644
index 000000000..fe6368a91
--- /dev/null
+++ b/controller/src/reconciler/mcp_egress.rs
@@ -0,0 +1,275 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+//! MCP egress derivation.
+//!
+//! "Apply an `McpServer` and reference it — it just works." The per-pod
+//! inference router is the only network path to an MCP server, so the sandbox's
+//! default-deny `NetworkPolicy` must admit the router→server hop without the
+//! operator also hand-writing a `networkPolicy.allowedEndpoints` entry. These
+//! helpers turn an `McpServer.spec.url` into the matching egress rule; the
+//! reconciler walks every referenced server and adds the derived rules.
+
+use serde_json::json;
+
+/// Auto-derive the NetworkPolicy egress rules that admit the sandbox router to
+/// every `McpServer` the sandbox references. For each referent we fetch its
+/// `spec.url`, parse it, and build the matching rule (see [`mcp_egress_rule`]),
+/// skipping any that duplicate `existing` rules or each other.
+///
+/// A missing (`404`) referent is logged and skipped — the JWKS-mirror path
+/// surfaces it elsewhere. Any other API error is returned so the caller can
+/// requeue. Empty / unparseable URLs are skipped (those endpoints still need an
+/// explicit allowlist entry).
+pub(crate) async fn derive_mcp_egress_rules(
+    client: &kube::Client,
+    sandbox_self_ns: &str,
+    refs: &[&crate::mcp_server::LocalObjectRef],
+    sandbox_name: &str,
+    existing: &[serde_json::Value],
+) -> Result<Vec<serde_json::Value>, kube::Error> {
+    use kube::api::Api;
+    let mcp_api: Api<crate::mcp_server::McpServer> =
+        Api::namespaced(client.clone(), sandbox_self_ns);
+    let mut out: Vec<serde_json::Value> = Vec::new();
+    for mcp_ref in refs {
+        let ref_name = mcp_ref.name.trim();
+        if ref_name.is_empty() {
+            continue;
+        }
+        let url = match mcp_api.get(ref_name).await {
+            Ok(mcp) => mcp.spec.url.unwrap_or_default(),
+            Err(kube::Error::Api(ae)) if ae.code == 404 => {
+                tracing::warn!(
+                    sandbox = %sandbox_name,
+                    mcp = %ref_name,
+                    "referenced McpServer not found; egress not auto-derived",
+                );
+                continue;
+            }
+            Err(e) => return Err(e),
+        };
+        if url.is_empty() {
+            continue;
+        }
+        let Some((host, port)) = mcp_url_host_port(&url) else {
+            tracing::warn!(
+                sandbox = %sandbox_name,
+                mcp = %ref_name,
+                url = %url,
+                "McpServer url not parseable for egress derivation; skipping",
+            );
+            continue;
+        };
+        if let Some(rule) = mcp_egress_rule(&host, port)
+            && !existing.contains(&rule)
+            && !out.contains(&rule)
+        {
+            tracing::info!(
+                sandbox = %sandbox_name,
+                mcp = %ref_name,
+                host = %host,
+                port,
+                "auto-derived MCP egress rule",
+            );
+            out.push(rule);
+        }
+    }
+    Ok(out)
+}
+
+/// Parse an `McpServer.spec.url` into the `(host, port)` the sandbox's
+/// inference-router must be allowed to reach.
+///
+/// Returns `None` when the URL is empty or unparseable (the caller skips
+/// egress derivation and lets the router surface the misconfiguration).
+/// The port defaults from the scheme (`https` → 443, `http` → 80) when not
+/// explicitly given. Only `http`/`https` MCP transports are supported.
+///
+/// Examples:
+/// - `https://mcp.deepwiki.com/mcp` → `("mcp.deepwiki.com", 443)`
+/// - `http://playwright-mcp.default.svc.cluster.local:8931/mcp`
+///   → `("playwright-mcp.default.svc.cluster.local", 8931)`
+pub(crate) fn mcp_url_host_port(url: &str) -> Option<(String, u16)> {
+    let url = url.trim();
+    let (scheme, rest) = url.split_once("://")?;
+    let default_port: u16 = match scheme.to_ascii_lowercase().as_str() {
+        "https" => 443,
+        "http" => 80,
+        _ => return None,
+    };
+    // Strip path / query / fragment — everything from the first '/', '?' or '#'.
+    let authority = rest.split(['/', '?', '#']).next().unwrap_or("").trim();
+    if authority.is_empty() {
+        return None;
+    }
+    // Drop any userinfo ("user:pass@host").
+    let host_port = authority.rsplit('@').next().unwrap_or(authority);
+    // IPv6 literal `[::1]:8931` — not expected for MCP services; reject so the
+    // caller falls back rather than mis-splitting on the inner colons.
+    if host_port.starts_with('[') {
+        return None;
+    }
+    let (host, port) = match host_port.rsplit_once(':') {
+        Some((h, p)) => {
+            let port = p.parse::<u16>().ok()?;
+            (h, port)
+        }
+        None => (host_port, default_port),
+    };
+    if host.is_empty() {
+        return None;
+    }
+    Some((host.to_string(), port))
+}
+
+/// Build the NetworkPolicy egress rule that admits the sandbox router to an
+/// MCP server at `host:port`. In-cluster Service DNS names get a
+/// `namespaceSelector` rule (the only form Cilium honours for pod
+/// destinations); external hosts get the coarse port-level `ipBlock`
+/// (host-level enforcement lives in the router). Returns `None` for an
+/// external host on 443 — already covered by the blanket HTTPS rule.
+pub(crate) fn mcp_egress_rule(host: &str, port: u16) -> Option<serde_json::Value> {
+    if let Some(ns) = cluster_internal_namespace(host) {
+        Some(json!({
+            "to": [{
+                "namespaceSelector": {
+                    "matchLabels": { "kubernetes.io/metadata.name": ns }
+                }
+            }],
+            "ports": [{"protocol": "TCP", "port": port}]
+        }))
+    } else if port == 443 {
+        None
+    } else {
+        Some(json!({
+            "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}],
+            "ports": [{"protocol": "TCP", "port": port}]
+        }))
+    }
+}
+
+/// If `host` is a cluster-internal Kubernetes DNS name, return the namespace
+/// it resolves into; otherwise `None`.
+///
+/// In-cluster MCP servers / egress endpoints are addressed by their Service
+/// DNS name — `<svc>.<ns>.svc.cluster.local` (or the `.svc` short form). For
+/// those we MUST express the NetworkPolicy egress with a `namespaceSelector`,
+/// not an `ipBlock`: under the Cilium CNI a K8s NetworkPolicy `ipBlock` CIDR
+/// (including `0.0.0.0/0`) only matches the reserved `world` entity and is
+/// NOT applied to in-cluster pod endpoints, so an `ipBlock`-based rule silently
+/// fails to admit traffic to another pod. Selecting the destination namespace
+/// by its well-known `kubernetes.io/metadata.name` label is the portable way to
+/// open egress to an in-cluster Service.
+///
+/// Returns `None` for external hostnames (e.g. `api.openai.com`) and for bare,
+/// namespace-ambiguous names — those keep the coarse `ipBlock` port rule, with
+/// fine-grained host enforcement handled by the router's CONNECT allowlist.
+pub(crate) fn cluster_internal_namespace(host: &str) -> Option<String> {
+    let host = host.trim().trim_end_matches('.');
+    // `<svc>.<ns>.svc.cluster.local` or the cluster-suffixless `<svc>.<ns>.svc`.
+    let stripped = host
+        .strip_suffix(".svc.cluster.local")
+        .or_else(|| host.strip_suffix(".svc"))?;
+    // After stripping the `.svc[.cluster.local]` suffix, the remainder is
+    // `<svc>.<ns>`; the namespace is the last label.
+    let ns = stripped.rsplit('.').next()?;
+    if ns.is_empty() || ns == stripped {
+        // `stripped` had no `.` → no namespace label present.
+        return None;
+    }
+    Some(ns.to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::{cluster_internal_namespace, mcp_egress_rule, mcp_url_host_port};
+
+    #[test]
+    fn cluster_internal_namespace_parses_fqdn_forms() {
+        assert_eq!(
+            cluster_internal_namespace("playwright-mcp.default.svc.cluster.local"),
+            Some("default".to_string())
+        );
+        assert_eq!(
+            cluster_internal_namespace("playwright-mcp.default.svc"),
+            Some("default".to_string())
+        );
+        // Trailing dot (fully-qualified) is tolerated.
+        assert_eq!(
+            cluster_internal_namespace("svc-x.team-a.svc.cluster.local."),
+            Some("team-a".to_string())
+        );
+    }
+
+    #[test]
+    fn cluster_internal_namespace_rejects_external_and_ambiguous() {
+        // External hostnames keep the coarse ipBlock rule.
+        assert_eq!(cluster_internal_namespace("api.openai.com"), None);
+        assert_eq!(cluster_internal_namespace("example.com"), None);
+        // Bare names without a namespace label are ambiguous → None.
+        assert_eq!(cluster_internal_namespace("playwright-mcp"), None);
+        // A `.svc` suffix with no namespace label is not a valid in-cluster name.
+        assert_eq!(cluster_internal_namespace("svc"), None);
+    }
+
+    #[test]
+    fn mcp_url_host_port_parses_common_forms() {
+        // Remote HTTPS MCP — default port 443.
+        assert_eq!(
+            mcp_url_host_port("https://mcp.deepwiki.com/mcp"),
+            Some(("mcp.deepwiki.com".to_string(), 443))
+        );
+        assert_eq!(
+            mcp_url_host_port("https://api.githubcopilot.com/mcp"),
+            Some(("api.githubcopilot.com".to_string(), 443))
+        );
+        // In-cluster HTTP MCP with explicit port.
+        assert_eq!(
+            mcp_url_host_port("http://playwright-mcp.default.svc.cluster.local:8931/mcp"),
+            Some(("playwright-mcp.default.svc.cluster.local".to_string(), 8931))
+        );
+        // http default port 80, no path.
+        assert_eq!(
+            mcp_url_host_port("http://svc.ns.svc.cluster.local"),
+            Some(("svc.ns.svc.cluster.local".to_string(), 80))
+        );
+        // Query/fragment are stripped.
+        assert_eq!(
+            mcp_url_host_port("https://host.example:8443/mcp?x=1#frag"),
+            Some(("host.example".to_string(), 8443))
+        );
+    }
+
+    #[test]
+    fn mcp_url_host_port_rejects_invalid() {
+        assert_eq!(mcp_url_host_port(""), None);
+        assert_eq!(mcp_url_host_port("not-a-url"), None);
+        assert_eq!(mcp_url_host_port("ftp://host:21/x"), None);
+        assert_eq!(mcp_url_host_port("https://"), None);
+        // Non-numeric port.
+        assert_eq!(mcp_url_host_port("https://host:notaport/mcp"), None);
+        // IPv6 literals are not supported (fall back rather than mis-split).
+        assert_eq!(mcp_url_host_port("http://[::1]:8931/mcp"), None);
+    }
+
+    #[test]
+    fn mcp_egress_rule_shapes_by_destination() {
+        // In-cluster → namespaceSelector on the destination namespace + port.
+        let rule = mcp_egress_rule("playwright-mcp.default.svc.cluster.local", 8931)
+            .expect("in-cluster rule");
+        assert_eq!(
+            rule["to"][0]["namespaceSelector"]["matchLabels"]["kubernetes.io/metadata.name"],
+            "default"
+        );
+        assert_eq!(rule["ports"][0]["port"], 8931);
+
+        // External non-443 → coarse ipBlock + port.
+        let rule = mcp_egress_rule("mcp.example.com", 8080).expect("external non-443 rule");
+        assert_eq!(rule["to"][0]["ipBlock"]["cidr"], "0.0.0.0/0");
+        assert_eq!(rule["ports"][0]["port"], 8080);
+
+        // External 443 → None (blanket HTTPS rule already covers it).
+        assert!(mcp_egress_rule("mcp.deepwiki.com", 443).is_none());
+    }
+}
diff --git a/controller/src/reconciler/mod.rs b/controller/src/reconciler/mod.rs
index 018271a90..ce919983d 100644
--- a/controller/src/reconciler/mod.rs
+++ b/controller/src/reconciler/mod.rs
@@ -36,8 +36,11 @@ use crate::fedcred::{FedCredConfig, FedCredManager};
 pub(crate) mod byo_contract;
 mod dev_env;
 pub(crate) mod governance_mounts;
+mod mcp_egress;
 pub(crate) mod trustgraph_mount;
 
+use mcp_egress::mcp_egress_rule;
+
 /// Build pod security context, conditionally including SELinux options and
 /// choosing between RuntimeDefault and Localhost seccomp profiles.
 /// For Kata (confidential), we use RuntimeDefault since the VM provides isolation.
@@ -89,109 +92,6 @@ pub(crate) fn isolation_scheduling(isolation: &str) -> (Option<&'static str>, &'
     }
 }
 
-/// Parse an `McpServer.spec.url` into the `(host, port)` the sandbox's
-/// inference-router must be allowed to reach.
-///
-/// Returns `None` when the URL is empty or unparseable (the caller skips
-/// egress derivation and lets the router surface the misconfiguration).
-/// The port defaults from the scheme (`https` → 443, `http` → 80) when not
-/// explicitly given. Only `http`/`https` MCP transports are supported.
-///
-/// Examples:
-/// - `https://mcp.deepwiki.com/mcp` → `("mcp.deepwiki.com", 443)`
-/// - `http://playwright-mcp.default.svc.cluster.local:8931/mcp`
-///   → `("playwright-mcp.default.svc.cluster.local", 8931)`
-pub(crate) fn mcp_url_host_port(url: &str) -> Option<(String, u16)> {
-    let url = url.trim();
-    let (scheme, rest) = url.split_once("://")?;
-    let default_port: u16 = match scheme.to_ascii_lowercase().as_str() {
-        "https" => 443,
-        "http" => 80,
-        _ => return None,
-    };
-    // Strip path / query / fragment — everything from the first '/', '?' or '#'.
-    let authority = rest.split(['/', '?', '#']).next().unwrap_or("").trim();
-    if authority.is_empty() {
-        return None;
-    }
-    // Drop any userinfo ("user:pass@host").
-    let host_port = authority.rsplit('@').next().unwrap_or(authority);
-    // IPv6 literal `[::1]:8931` — not expected for MCP services; reject so the
-    // caller falls back rather than mis-splitting on the inner colons.
-    if host_port.starts_with('[') {
-        return None;
-    }
-    let (host, port) = match host_port.rsplit_once(':') {
-        Some((h, p)) => {
-            let port = p.parse::<u16>().ok()?;
-            (h, port)
-        }
-        None => (host_port, default_port),
-    };
-    if host.is_empty() {
-        return None;
-    }
-    Some((host.to_string(), port))
-}
-
-/// Build the NetworkPolicy egress rule that admits the sandbox router to an
-/// MCP server at `host:port`. In-cluster Service DNS names get a
-/// `namespaceSelector` rule (the only form Cilium honours for pod
-/// destinations); external hosts get the coarse port-level `ipBlock`
-/// (host-level enforcement lives in the router). Returns `None` for an
-/// external host on 443 — already covered by the blanket HTTPS rule.
-fn mcp_egress_rule(host: &str, port: u16) -> Option<serde_json::Value> {
-    if let Some(ns) = cluster_internal_namespace(host) {
-        Some(json!({
-            "to": [{
-                "namespaceSelector": {
-                    "matchLabels": { "kubernetes.io/metadata.name": ns }
-                }
-            }],
-            "ports": [{"protocol": "TCP", "port": port}]
-        }))
-    } else if port == 443 {
-        None
-    } else {
-        Some(json!({
-            "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}],
-            "ports": [{"protocol": "TCP", "port": port}]
-        }))
-    }
-}
-
-/// If `host` is a cluster-internal Kubernetes DNS name, return the namespace
-/// it resolves into; otherwise `None`.
-///
-/// In-cluster MCP servers / egress endpoints are addressed by their Service
-/// DNS name — `<svc>.<ns>.svc.cluster.local` (or the `.svc` short form). For
-/// those we MUST express the NetworkPolicy egress with a `namespaceSelector`,
-/// not an `ipBlock`: under the Cilium CNI a K8s NetworkPolicy `ipBlock` CIDR
-/// (including `0.0.0.0/0`) only matches the reserved `world` entity and is
-/// NOT applied to in-cluster pod endpoints, so an `ipBlock`-based rule silently
-/// fails to admit traffic to another pod. Selecting the destination namespace
-/// by its well-known `kubernetes.io/metadata.name` label is the portable way to
-/// open egress to an in-cluster Service.
-///
-/// Returns `None` for external hostnames (e.g. `api.openai.com`) and for bare,
-/// namespace-ambiguous names — those keep the coarse `ipBlock` port rule, with
-/// fine-grained host enforcement handled by the router's CONNECT allowlist.
-pub(crate) fn cluster_internal_namespace(host: &str) -> Option<String> {
-    let host = host.trim().trim_end_matches('.');
-    // `<svc>.<ns>.svc.cluster.local` or the cluster-suffixless `<svc>.<ns>.svc`.
-    let stripped = host
-        .strip_suffix(".svc.cluster.local")
-        .or_else(|| host.strip_suffix(".svc"))?;
-    // After stripping the `.svc[.cluster.local]` suffix, the remainder is
-    // `<svc>.<ns>`; the namespace is the last label.
-    let ns = stripped.rsplit('.').next()?;
-    if ns.is_empty() || ns == stripped {
-        // `stripped` had no `.` → no namespace label present.
-        return None;
-    }
-    Some(ns.to_string())
-}
-
 /// Build the egress-guard init-container command.
 ///
 /// Standard sandboxes (every kind except SRE) get the full lockdown:
@@ -354,99 +254,6 @@ mod egress_guard_tests {
             );
         }
     }
-
-    #[test]
-    fn cluster_internal_namespace_parses_fqdn_forms() {
-        use super::cluster_internal_namespace;
-        assert_eq!(
-            cluster_internal_namespace("playwright-mcp.default.svc.cluster.local"),
-            Some("default".to_string())
-        );
-        assert_eq!(
-            cluster_internal_namespace("playwright-mcp.default.svc"),
-            Some("default".to_string())
-        );
-        // Trailing dot (fully-qualified) is tolerated.
-        assert_eq!(
-            cluster_internal_namespace("svc-x.team-a.svc.cluster.local."),
-            Some("team-a".to_string())
-        );
-    }
-
-    #[test]
-    fn cluster_internal_namespace_rejects_external_and_ambiguous() {
-        use super::cluster_internal_namespace;
-        // External hostnames keep the coarse ipBlock rule.
-        assert_eq!(cluster_internal_namespace("api.openai.com"), None);
-        assert_eq!(cluster_internal_namespace("example.com"), None);
-        // Bare names without a namespace label are ambiguous → None.
-        assert_eq!(cluster_internal_namespace("playwright-mcp"), None);
-        // A `.svc` suffix with no namespace label is not a valid in-cluster name.
-        assert_eq!(cluster_internal_namespace("svc"), None);
-    }
-
-    #[test]
-    fn mcp_url_host_port_parses_common_forms() {
-        use super::mcp_url_host_port;
-        // Remote HTTPS MCP — default port 443.
-        assert_eq!(
-            mcp_url_host_port("https://mcp.deepwiki.com/mcp"),
-            Some(("mcp.deepwiki.com".to_string(), 443))
-        );
-        assert_eq!(
-            mcp_url_host_port("https://api.githubcopilot.com/mcp"),
-            Some(("api.githubcopilot.com".to_string(), 443))
-        );
-        // In-cluster HTTP MCP with explicit port.
-        assert_eq!(
-            mcp_url_host_port("http://playwright-mcp.default.svc.cluster.local:8931/mcp"),
-            Some(("playwright-mcp.default.svc.cluster.local".to_string(), 8931))
-        );
-        // http default port 80, no path.
-        assert_eq!(
-            mcp_url_host_port("http://svc.ns.svc.cluster.local"),
-            Some(("svc.ns.svc.cluster.local".to_string(), 80))
-        );
-        // Query/fragment are stripped.
-        assert_eq!(
-            mcp_url_host_port("https://host.example:8443/mcp?x=1#frag"),
-            Some(("host.example".to_string(), 8443))
-        );
-    }
-
-    #[test]
-    fn mcp_url_host_port_rejects_invalid() {
-        use super::mcp_url_host_port;
-        assert_eq!(mcp_url_host_port(""), None);
-        assert_eq!(mcp_url_host_port("not-a-url"), None);
-        assert_eq!(mcp_url_host_port("ftp://host:21/x"), None);
-        assert_eq!(mcp_url_host_port("https://"), None);
-        // Non-numeric port.
-        assert_eq!(mcp_url_host_port("https://host:notaport/mcp"), None);
-        // IPv6 literals are not supported (fall back rather than mis-split).
-        assert_eq!(mcp_url_host_port("http://[::1]:8931/mcp"), None);
-    }
-
-    #[test]
-    fn mcp_egress_rule_shapes_by_destination() {
-        use super::mcp_egress_rule;
-        // In-cluster → namespaceSelector on the destination namespace + port.
-        let rule = mcp_egress_rule("playwright-mcp.default.svc.cluster.local", 8931)
-            .expect("in-cluster rule");
-        assert_eq!(
-            rule["to"][0]["namespaceSelector"]["matchLabels"]["kubernetes.io/metadata.name"],
-            "default"
-        );
-        assert_eq!(rule["ports"][0]["port"], 8931);
-
-        // External non-443 → coarse ipBlock + port.
-        let rule = mcp_egress_rule("mcp.example.com", 8080).expect("external non-443 rule");
-        assert_eq!(rule["to"][0]["ipBlock"]["cidr"], "0.0.0.0/0");
-        assert_eq!(rule["ports"][0]["port"], 8080);
-
-        // External 443 → None (blanket HTTPS rule already covers it).
-        assert!(mcp_egress_rule("mcp.deepwiki.com", 443).is_none());
-    }
 }
 #[derive(Debug, thiserror::Error)]
 enum ReconcileError {
@@ -1382,96 +1189,32 @@ async fn reconcile(sandbox: Arc<KarsSandbox>, ctx: Arc<Context>) -> Result<Actio
                 // 443 is already covered by the blanket HTTPS rule.
                 continue;
             }
-            // In-cluster MCP servers / endpoints (`<svc>.<ns>.svc.cluster.local`)
-            // must be admitted with a `namespaceSelector`: a `0.0.0.0/0` ipBlock
-            // does NOT match in-cluster pods under Cilium, so the agent's router
-            // could never reach an in-cluster MCP server. External hosts keep the
-            // coarse port-level ipBlock (host-level enforcement lives in the
-            // router's CONNECT allowlist).
-            if let Some(ns) = cluster_internal_namespace(&ep.host) {
-                egress_rules.push(json!({
-                    "to": [{
-                        "namespaceSelector": {
-                            "matchLabels": { "kubernetes.io/metadata.name": ns }
-                        }
-                    }],
-                    "ports": [{"protocol": "TCP", "port": port}]
-                }));
-            } else {
-                egress_rules.push(json!({
-                    "to": [{"ipBlock": {"cidr": "0.0.0.0/0"}}],
-                    "ports": [{"protocol": "TCP", "port": port}]
-                }));
+            // In-cluster endpoints need a `namespaceSelector` (a `0.0.0.0/0`
+            // ipBlock doesn't match in-cluster pods under Cilium); external
+            // hosts keep the coarse ipBlock. `mcp_egress_rule` encodes both and
+            // only returns `None` for 443, already skipped above.
+            if let Some(rule) = mcp_egress_rule(&ep.host, port) {
+                egress_rules.push(rule);
             }
         }
     }
 
-    // ── Auto-derive egress to referenced McpServer endpoints ─────────────
-    //
-    // "Apply an McpServer CRD and reference it — it just works." The router
-    // is the only network path to an MCP server, so the sandbox NetworkPolicy
-    // must admit the router→server hop WITHOUT the operator also hand-writing
-    // a `networkPolicy.allowedEndpoints` entry. For every McpServer the
-    // sandbox references via `governance.mcpServerRefs`, we parse its
-    // `spec.url` and emit the matching egress rule:
-    //   - in-cluster Service DNS  → `namespaceSelector` (Cilium-correct)
-    //   - external host, non-443  → coarse `ipBlock` + port
-    //   - external host, 443      → already covered by the blanket HTTPS rule
-    //
-    // Remote HTTPS MCPs (DeepWiki, GitHub, hosted Playwright) therefore need
-    // zero extra config; self-hosted in-cluster MCPs on a non-443 port get
-    // their egress opened automatically.
+    // Auto-derive egress to every referenced McpServer so "apply an McpServer
+    // and reference it" just works — the router is the only path to it, so the
+    // default-deny NetworkPolicy must admit the router→server hop. De-duplicated.
+    match mcp_egress::derive_mcp_egress_rules(
+        client,
+        &sandbox_self_ns,
+        &governance_config.effective_mcp_server_refs(),
+        &name,
+        &egress_rules,
+    )
+    .await
     {
-        let mcp_api: Api<crate::mcp_server::McpServer> =
-            Api::namespaced(client.clone(), &sandbox_self_ns);
-        for mcp_ref in governance_config.effective_mcp_server_refs() {
-            let ref_name = mcp_ref.name.trim();
-            if ref_name.is_empty() {
-                continue;
-            }
-            let url = match mcp_api.get(ref_name).await {
-                Ok(mcp) => mcp.spec.url.unwrap_or_default(),
-                Err(kube::Error::Api(ae)) if ae.code == 404 => {
-                    // Missing referent is surfaced by the JWKS-mirror path
-                    // below; here we just skip egress derivation for it.
-                    tracing::warn!(
-                        sandbox = %name,
-                        mcp = %ref_name,
-                        "referenced McpServer not found; egress not auto-derived",
-                    );
-                    continue;
-                }
-                Err(e) => {
-                    tracing::error!(error = %e, sandbox = %name, mcp = %ref_name, "McpServer lookup failed");
-                    return Ok(Action::requeue(Duration::from_secs(15)));
-                }
-            };
-            // Empty when the signed-bundle authoring path supplies the url;
-            // those endpoints still need an explicit allowlist entry.
-            if url.is_empty() {
-                continue;
-            }
-            let Some((host, port)) = mcp_url_host_port(&url) else {
-                tracing::warn!(
-                    sandbox = %name,
-                    mcp = %ref_name,
-                    url = %url,
-                    "McpServer url not parseable for egress derivation; skipping",
-                );
-                continue;
-            };
-            if let Some(rule) = mcp_egress_rule(&host, port)
-                && !egress_rules.contains(&rule)
-            {
-                tracing::info!(
-                    sandbox = %name,
-                    mcp = %ref_name,
-                    host = %host,
-                    port,
-                    "auto-derived MCP egress rule",
-                );
-                egress_rules.push(rule);
-            }
+        Ok(rules) => egress_rules.extend(rules),
+        Err(e) => {
+            tracing::error!(error = %e, sandbox = %name, "MCP egress derivation failed");
+            return Ok(Action::requeue(Duration::from_secs(15)));
         }
     }
 

From 66c53ab2a434a7283055869517df1d0cac7e243e Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:40:38 +0200
Subject: [PATCH 08/10] docs(security): security audit for MCP OOTB + CLI
 update (v0.1.24)

Signed-off-by: Pal Lakatos-Toth <pallakatos@microsoft.com>
Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com>

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../2026-06-30-mcp-out-of-the-box.md          | 98 +++++++++++++++++++
 1 file changed, 98 insertions(+)
 create mode 100644 docs/security-audits/2026-06-30-mcp-out-of-the-box.md

diff --git a/docs/security-audits/2026-06-30-mcp-out-of-the-box.md b/docs/security-audits/2026-06-30-mcp-out-of-the-box.md
new file mode 100644
index 000000000..1c9721b93
--- /dev/null
+++ b/docs/security-audits/2026-06-30-mcp-out-of-the-box.md
@@ -0,0 +1,98 @@
+# Security Audit — MCP out-of-the-box: session keepalive, egress auto-derive, CLI update (v0.1.24)
+
+Date: 2026-06-30
+Scope: `inference-router/src/mcp/forwarder.rs`, `controller/src/reconciler/mod.rs`, `controller/src/reconciler/mcp_egress.rs`, `cli/src/commands/update.ts`, `cli/src/lib/update-check.ts`, `cli/src/commands/upgrade.ts`.
+Gated paths: `inference-router/src/mcp/forwarder.rs`, `controller/src/reconciler/...`, `cli/src/commands/...`.
+
+## Summary
+
+This change makes MCP servers work out of the box and adds a CLI self-update
+notice. Four capability-touching pieces:
+
+1. **Router MCP session keepalive** — the forwarder now holds the standalone
+   `GET /mcp` SSE stream open and answers a heartbeating MCP server's `ping`s
+   with `pong`s so the upstream doesn't reap the session mid-task.
+2. **Controller MCP egress auto-derivation** — the controller parses each
+   referenced `McpServer.spec.url` and emits the matching default-deny
+   NetworkPolicy egress rule (namespaceSelector for in-cluster, coarse ipBlock
+   for external non-443, nothing for external 443).
+3. **CLI `kars update` + automatic update notice** — best-effort npm version
+   check + optional `npm install -g @kars-runtime/cli@latest`.
+4. **`kars upgrade` pre-flight** — server-side-apply conflict detection before
+   the atomic Helm upgrade (read-only dry-run).
+
+## T1: New capability / attack surface? (YES — bounded)
+
+- **Keepalive (router→MCP):** opens one long-lived GET to an MCP endpoint the
+  router was *already* authorised to call (same URL, headers, session id as the
+  tools/call path). It only ever **reads** SSE frames and **writes** a JSON-RPC
+  `pong` (`{"id":<n>,"result":{}}`) — no new destinations, no new credentials,
+  no agent-reachable surface (the agent still only sees loopback tools). The
+  pong responder replies solely to server-initiated `ping`s; any other inbound
+  frame is ignored. The task is bounded by a per-request timeout and is
+  aborted/replaced on session re-init.
+- **Egress auto-derivation:** *narrows* rather than widens — it adds precisely
+  the rule needed to reach a *referenced* `McpServer` and nothing else. The
+  derivation is driven only by `governance.mcpServerRefs` (operator-authored)
+  and the referenced CR's `url`; an attacker cannot inject a rule without RBAC
+  to create an `McpServer` and to reference it from a sandbox. In-cluster URLs
+  yield a `namespaceSelector` scoped to the MCP's namespace + exact port;
+  external non-443 yields a coarse port rule with host enforcement still at the
+  router; external 443 adds nothing. Unparseable/missing referents are skipped
+  (fail-closed: no rule added).
+- **CLI update check:** introduces two outbound HTTPS calls from the operator's
+  workstation (npm registry `latest` + GitHub release notes), each bounded by a
+  1.5s timeout, cached ≤24h in `~/.kars/update-check.json`, off in CI/non-TTY
+  and via `KARS_NO_UPDATE_CHECK=1`. It never auto-executes an install: the
+  passive notice only prints text; the actual `npm install -g` runs only on an
+  explicit `kars update` (interactive confirm, or `--yes`). No tokens, no
+  cluster access, no code executed from the network response (only a version
+  string and a changelog line are read).
+- **Upgrade pre-flight:** a `helm upgrade --dry-run=server` — read-only,
+  mutates nothing — whose output is parsed for field-manager conflicts.
+
+## T2: Security-control change? (NEUTRAL→IMPROVED)
+
+- The router remains the sole network path to MCP servers; governance (trust,
+  ToolPolicy, audit, token budget) on tool calls is unchanged. Keepalive adds
+  no tool surface.
+- Egress stays **default-deny**; auto-derivation produces the *same* rules an
+  operator would have hand-written, but correctly (namespaceSelector under
+  Cilium, where an ipBlock silently fails for in-cluster pods). This removes a
+  foot-gun that previously led operators to widen egress by hand.
+- The session-loss classifier was tightened to fire only on genuine 4xx hard
+  signals, so a healthy 2xx tool result that merely mentions "session" no
+  longer triggers a destructive re-init.
+
+## T3: Availability / fail-open risk? (REDUCED)
+
+- Keepalive **reduces** an availability bug: stateful MCP sessions were being
+  reaped after ~5s, breaking multi-step flows. All keepalive failures are
+  swallowed and the task simply exits/reconnects; it never blocks a tool call.
+- Egress derivation that errors on the API returns a short requeue (no partial
+  NetworkPolicy is applied). A missing/unparseable referent is skipped rather
+  than failing the whole reconcile.
+- The CLI update check is fully best-effort: every error path returns silently;
+  it can never block or fail a `kars` command. The upgrade pre-flight is
+  additive and only *stops before* a known-bad apply (fail-closed, safer).
+
+## Verification
+
+- Rust: `cargo test --all` green (controller 854 + router 957 + integration);
+  `cargo clippy --all-targets -D warnings` clean; `cargo audit` /
+  `cargo deny check` clean (anyhow bumped to 1.0.103, RUSTSEC-2026-0190).
+- CLI: build + typecheck + `oxlint` (0 errors) + 928 vitest tests green
+  (incl. new `update-check` suite).
+- Live AKS validation: both sandbox routers hold a persistent GET SSE stream to
+  Playwright that survives well past the 5s reaper; zero `session lost` log
+  lines. Example CRDs validated with `kubectl apply --dry-run=server`.
+- `ci/check-loc.sh`, `ci/no-stubs.sh`, `ci/check-copyright-headers.sh` pass.
+
+## Verdict
+
+Accept — the new surface is bounded, operator-gated, and fail-closed; the net
+effect is improved availability (session keepalive), tighter-by-construction
+egress, and a non-intrusive, consent-gated CLI updater.
+
+Signed-off-by: Pal Lakatos-Toth <pallakatos@microsoft.com>
+Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com>

From b6859217ba73af77bf833957575281eadccbbbb8 Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:49:10 +0200
Subject: [PATCH 09/10] fix(cli): harden update-check version handling (CodeQL)

Address four CodeQL findings in the new update-check lib:
- Strictly validate every version token (VERSION_RE / sanitizeVersion) before
  it is cached to disk or interpolated into a registry/GitHub URL, so untrusted
  response/cache bytes can't reach the filesystem or an outbound request
  (fixes 'network data written to file' + 'file data in outbound request').
- Percent-encode every '/' in the package name (replaceAll), not just the first
  (fixes 'incomplete string escaping').
- Tighten the test's registry-URL match to startsWith (fixes 'incomplete URL
  substring sanitization').

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 cli/src/lib/update-check.test.ts | 21 ++++++++++++++++++++-
 cli/src/lib/update-check.ts      | 31 +++++++++++++++++++++++++------
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/cli/src/lib/update-check.test.ts b/cli/src/lib/update-check.test.ts
index cea271ea2..87a556a89 100644
--- a/cli/src/lib/update-check.test.ts
+++ b/cli/src/lib/update-check.test.ts
@@ -11,6 +11,7 @@ import {
   fetchLatestVersion,
   fetchChangelogSummary,
   checkForCliUpdate,
+  sanitizeVersion,
   CLI_PACKAGE,
 } from "./update-check.js";
 
@@ -72,6 +73,24 @@ describe("summarizeReleaseBody", () => {
   });
 });
 
+describe("sanitizeVersion", () => {
+  it("accepts well-formed semver tokens", () => {
+    expect(sanitizeVersion("0.1.24")).toBe("0.1.24");
+    expect(sanitizeVersion("10.20.30")).toBe("10.20.30");
+    expect(sanitizeVersion("1.2.3-interim.4")).toBe("1.2.3-interim.4");
+  });
+
+  it("rejects anything that isn't a clean version (no URLs, paths, junk)", () => {
+    expect(sanitizeVersion("../../etc/passwd")).toBeNull();
+    expect(sanitizeVersion("1.2")).toBeNull();
+    expect(sanitizeVersion("v1.2.3")).toBeNull(); // caller strips the leading v
+    expect(sanitizeVersion("1.2.3/evil")).toBeNull();
+    expect(sanitizeVersion("https://evil.example/1.2.3")).toBeNull();
+    expect(sanitizeVersion(42)).toBeNull();
+    expect(sanitizeVersion(undefined)).toBeNull();
+  });
+});
+
 describe("fetchLatestVersion", () => {
   const realFetch = globalThis.fetch;
   afterEach(() => {
@@ -142,7 +161,7 @@ describe("checkForCliUpdate", () => {
   it("reports an update when the registry has a newer version", async () => {
     globalThis.fetch = vi.fn(async (url: string | URL) => {
       const u = String(url);
-      if (u.includes("registry.npmjs.org")) {
+      if (u.startsWith("https://registry.npmjs.org/")) {
         return new Response(JSON.stringify({ version: "999.0.0" }), { status: 200 });
       }
       // changelog
diff --git a/cli/src/lib/update-check.ts b/cli/src/lib/update-check.ts
index 740d9f77a..3fe4d36e0 100644
--- a/cli/src/lib/update-check.ts
+++ b/cli/src/lib/update-check.ts
@@ -79,7 +79,9 @@ async function readCache(): Promise<UpdateCache | null> {
     const parsed = JSON.parse(raw) as Partial<UpdateCache>;
     if (typeof parsed.checkedAt !== "number") return null;
     return {
-      latest: typeof parsed.latest === "string" ? parsed.latest : null,
+      // Re-validate the cached version on read: the file is untrusted input, so
+      // only a well-formed version token is allowed back into comparisons / URLs.
+      latest: sanitizeVersion(parsed.latest),
       checkedAt: parsed.checkedAt,
       notifiedAt: typeof parsed.notifiedAt === "number" ? parsed.notifiedAt : 0,
     };
@@ -98,15 +100,27 @@ async function writeCache(cache: UpdateCache): Promise<void> {
   }
 }
 
+/** Strict semver-ish token: `MAJOR.MINOR.PATCH` with an optional pre-release of
+ *  dot-separated alphanumerics/hyphens. This is the ONLY shape we ever accept
+ *  from the network, cache to disk, or interpolate into a URL — it sanitises
+ *  the registry/GitHub responses so untrusted bytes can't reach the filesystem
+ *  or an outbound request. Exported for tests. */
+export const VERSION_RE = /^\d+\.\d+\.\d+(?:-[0-9A-Za-z][0-9A-Za-z.-]*)?$/;
+
+/** Return `v` iff it's a well-formed version token, else null. */
+export function sanitizeVersion(v: unknown): string | null {
+  return typeof v === "string" && VERSION_RE.test(v) ? v : null;
+}
+
 /** Fetch the `latest` dist-tag version from the npm registry. Best-effort:
- *  returns null on any error or timeout. Exported for tests. */
+ *  returns null on any error, timeout, or malformed version. Exported for tests. */
 export async function fetchLatestVersion(
   pkg: string = CLI_PACKAGE,
   timeoutMs: number = FETCH_TIMEOUT_MS,
 ): Promise<string | null> {
   // The `latest` endpoint returns just the dist-tag manifest — far smaller than
-  // the full packument.
-  const url = `https://registry.npmjs.org/${pkg.replace("/", "%2f")}/latest`;
+  // the full packument. Percent-encode every "/" in the (scoped) package name.
+  const url = `https://registry.npmjs.org/${pkg.replace(/\//g, "%2F")}/latest`;
   try {
     const resp = await fetch(url, {
       signal: AbortSignal.timeout(timeoutMs),
@@ -114,7 +128,9 @@ export async function fetchLatestVersion(
     });
     if (!resp.ok) return null;
     const body = (await resp.json()) as { version?: string };
-    return typeof body.version === "string" ? body.version : null;
+    // Only ever return a strictly-validated version — nothing else is allowed
+    // to flow on to the cache file or a downstream URL.
+    return sanitizeVersion(body.version);
   } catch {
     return null;
   }
@@ -126,7 +142,10 @@ export async function fetchChangelogSummary(
   version: string,
   timeoutMs: number = FETCH_TIMEOUT_MS,
 ): Promise<string | undefined> {
-  const tag = version.startsWith("v") ? version : `v${version}`;
+  // Guard: refuse to interpolate anything but a validated version into the URL.
+  const safe = sanitizeVersion(version.replace(/^v/, ""));
+  if (!safe) return undefined;
+  const tag = `v${safe}`;
   const url = `https://api.github.com/repos/Azure/kars/releases/tags/${tag}`;
   try {
     const resp = await fetch(url, {

From c189fbd8cfb1e2c5fbc8bc4667bd49d84ce4e80f Mon Sep 17 00:00:00 2001
From: Pal Lakatos-Toth <pallakatos@github.com>
Date: Tue, 30 Jun 2026 15:49:33 +0200
Subject: [PATCH 10/10] =?UTF-8?q?chore(release):=20v0.1.24=20=E2=80=94=20M?=
 =?UTF-8?q?CP=20out-of-the-box=20(egress=20auto-derive=20+=20session=20kee?=
 =?UTF-8?q?palive)=20+=20kars=20update?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 cli/package-lock.json | 4 ++--
 cli/package.json      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cli/package-lock.json b/cli/package-lock.json
index cf9e807f7..698fd19c1 100644
--- a/cli/package-lock.json
+++ b/cli/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "@kars-runtime/cli",
-  "version": "0.1.23",
+  "version": "0.1.24",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "@kars-runtime/cli",
-      "version": "0.1.23",
+      "version": "0.1.24",
       "license": "MIT",
       "dependencies": {
         "@azure/identity": "^4.0.0",
diff --git a/cli/package.json b/cli/package.json
index bb4e48e28..1d1b8c323 100644
--- a/cli/package.json
+++ b/cli/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@kars-runtime/cli",
-  "version": "0.1.23",
+  "version": "0.1.24",
   "description": "Enterprise-grade runtime for running OpenClaw AI assistants safely on Azure",
   "license": "MIT",
   "repository": {