diff --git a/Cargo.lock b/Cargo.lock index b1a136e..96722f2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + [[package]] name = "aead" version = "0.5.2" @@ -268,6 +283,21 @@ dependencies = [ "tracing", ] +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + [[package]] name = "base64" version = "0.22.1" @@ -289,6 +319,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5" +dependencies = [ + "objc2", +] + [[package]] name = "bstr" version = "1.12.1" @@ -564,6 +603,16 @@ dependencies = [ "subtle", ] +[[package]] +name = "dispatch2" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" +dependencies = [ + "bitflags", + "objc2", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -618,6 +667,18 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + [[package]] name = "float-cmp" version = "0.10.0" @@ -802,6 +863,12 @@ dependencies = [ "polyval", ] +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + [[package]] name = "glob" version = "0.3.3" @@ -869,6 +936,17 @@ dependencies = [ "digest", ] +[[package]] +name = "hostname" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd" +dependencies = [ + "cfg-if", + "libc", + "windows-link", +] + [[package]] name = "http" version = "1.4.0" @@ -1314,6 +1392,15 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.2.0" @@ -1325,6 +1412,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "nix" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d6d0705320c1e6ba1d912b5e37cf18071b6c2e9b7fa8215a1e8a7651966f5d3" +dependencies = [ + "bitflags", + "cfg-if", + "cfg_aliases", + "libc", +] + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -1384,6 +1483,174 @@ dependencies = [ "libc", ] +[[package]] +name = "objc2" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f" +dependencies = [ + "objc2-encode", +] + +[[package]] +name = "objc2-cloud-kit" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ad74d880bb43877038da939b7427bba67e9dd42004a18b809ba7d87cee241c" +dependencies = [ + "bitflags", + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-data" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b402a653efbb5e82ce4df10683b6b28027616a2715e90009947d50b8dd298fa" +dependencies = [ + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" +dependencies = [ + "bitflags", + "dispatch2", + "objc2", +] + +[[package]] +name = "objc2-core-graphics" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e022c9d066895efa1345f8e33e584b9f958da2fd4cd116792e15e07e4720a807" +dependencies = [ + "bitflags", + "dispatch2", + "objc2", + "objc2-core-foundation", + "objc2-io-surface", +] + +[[package]] +name = "objc2-core-image" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5d563b38d2b97209f8e861173de434bd0214cf020e3423a52624cd1d989f006" +dependencies = [ + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-location" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca347214e24bc973fc025fd0d36ebb179ff30536ed1f80252706db19ee452009" +dependencies = [ + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-text" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde0dfb48d25d2b4862161a4d5fcc0e3c24367869ad306b0c9ec0073bfed92d" +dependencies = [ + "bitflags", + "objc2", + "objc2-core-foundation", + "objc2-core-graphics", +] + +[[package]] +name = "objc2-encode" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" + +[[package]] +name = "objc2-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272" +dependencies = [ + "bitflags", + "block2", + "libc", + "objc2", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-io-surface" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180788110936d59bab6bd83b6060ffdfffb3b922ba1396b312ae795e1de9d81d" +dependencies = [ + "bitflags", + "objc2", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-quartz-core" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c1358452b371bf9f104e21ec536d37a650eb10f7ee379fff67d2e08d537f1f" +dependencies = [ + "bitflags", + "objc2", + "objc2-core-foundation", + "objc2-foundation", +] + +[[package]] +name = "objc2-ui-kit" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87d638e33c06f577498cbcc50491496a3ed4246998a7fbba7ccb98b1e7eab22" +dependencies = [ + "bitflags", + "block2", + "objc2", + "objc2-cloud-kit", + "objc2-core-data", + "objc2-core-foundation", + "objc2-core-graphics", + "objc2-core-image", + "objc2-core-location", + "objc2-core-text", + "objc2-foundation", + "objc2-quartz-core", + "objc2-user-notifications", +] + +[[package]] +name = "objc2-user-notifications" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9df9128cbbfef73cda168416ccf7f837b62737d748333bfe9ab71c245d76613e" +dependencies = [ + "objc2", + "objc2-foundation", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -1419,6 +1686,22 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +[[package]] +name = "os_info" +version = "3.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf20a545b305cf1da722b236b5155c9bb35f1d5ceb28c048bd96ca842f41b5b" +dependencies = [ + "android_system_properties", + "log", + "nix", + "objc2", + "objc2-foundation", + "objc2-ui-kit", + "serde", + "windows-sys 0.61.2", +] + [[package]] name = "parking_lot" version = "0.12.5" @@ -1868,12 +2151,27 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + [[package]] name = "rustc-hash" version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.4" @@ -2039,13 +2337,42 @@ dependencies = [ "httpdate", "reqwest 0.13.2", "rustls", + "sentry-backtrace", + "sentry-contexts", "sentry-core", + "sentry-debug-images", "sentry-log", + "sentry-panic", "sentry-tracing", "tokio", "ureq", ] +[[package]] +name = "sentry-backtrace" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46a8c2c1bd5c1f735e84f28b48e7d72efcaafc362b7541bc8253e60e8fcdffc6" +dependencies = [ + "backtrace", + "regex", + "sentry-core", +] + +[[package]] +name = "sentry-contexts" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b88a90baa654d7f0e1f4b667f6b434293d9f72c71bef16b197c76af5b7d5803" +dependencies = [ + "hostname", + "libc", + "os_info", + "rustc_version", + "sentry-core", + "uname", +] + [[package]] name = "sentry-core" version = "0.47.0" @@ -2059,6 +2386,16 @@ dependencies = [ "url", ] +[[package]] +name = "sentry-debug-images" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd9646a972b57896d4a92ed200cf76139f8e30b3cfd03b6662ae59926d26633c" +dependencies = [ + "findshlibs", + "sentry-core", +] + [[package]] name = "sentry-log" version = "0.47.0" @@ -2070,6 +2407,16 @@ dependencies = [ "sentry-core", ] +[[package]] +name = "sentry-panic" +version = "0.47.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6127d3d304ba5ce0409401e85aae538e303a569f8dbb031bf64f9ba0f7174346" +dependencies = [ + "sentry-backtrace", + "sentry-core", +] + [[package]] name = "sentry-tracing" version = "0.47.0" @@ -2077,6 +2424,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27701acc51e68db5281802b709010395bfcbcb128b1d0a4e5873680d3b47ff0c" dependencies = [ "bitflags", + "sentry-backtrace", "sentry-core", "tracing-core", "tracing-subscriber", @@ -2662,6 +3010,15 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" +[[package]] +name = "uname" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b72f89f0ca32e4db1c04e2a72f5345d59796d4866a1ee0609084569f73683dc8" +dependencies = [ + "libc", +] + [[package]] name = "unicode-ident" version = "1.0.24" diff --git a/Cargo.toml b/Cargo.toml index f097dcf..19e435c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,7 +44,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } # Optional Sentry integration (off by default, enable with --features sentry) # Includes errors, traces, AND Sentry Logs (structured log forwarding). -sentry = { version = "0.47", features = ["tracing", "logs", "reqwest", "rustls"], default-features = false, optional = true } +sentry = { version = "0.47", features = ["tracing", "logs", "reqwest", "rustls", "panic", "backtrace", "contexts", "debug-images", "test"], default-features = false, optional = true } sentry-tracing = { version = "0.47", optional = true } # Serialization diff --git a/src/core/logging.rs b/src/core/logging.rs index 8cdab1b..795e461 100644 --- a/src/core/logging.rs +++ b/src/core/logging.rs @@ -97,6 +97,37 @@ pub fn shutdown(guard: Option) { #[inline] pub fn shutdown(_guard: Option) {} +/// Sentry `before_send` hook. Runs on every event the SDK is about to +/// transport. Today's job is narrow: drop events whose +/// `upstream_error_class` tag identifies them as quota / rate-limited +/// noise. +/// +/// Why: 402 (out of credit) and 429 (rate limited) are real and worth +/// breadcrumbing, but they're billing/throttling outcomes — not code +/// bugs — and the user reported thousands of them spamming the issue +/// list and burning quota. The `report_upstream_error` helper sets +/// `tags.upstream_error_class` for every classified event; we read +/// that tag here. +/// +/// We keep the helper's call to `tracing::warn!` for the quota/rate- +/// limited classes so the breadcrumb buffer still records context for +/// the *next* real error — but we never let the standalone Sentry +/// event ship. +/// +/// Anything without the tag (panics, unrelated `tracing::error!` from +/// other code paths) passes through unchanged. +#[cfg(feature = "sentry")] +fn before_send( + event: sentry::protocol::Event<'static>, +) -> Option> { + if let Some(class) = event.tags.get("upstream_error_class").map(String::as_str) { + if class == "quota" || class == "rate_limited" { + return None; + } + } + Some(event) +} + /// Initialize Sentry if a DSN is configured. Returns `None` when Sentry is /// disabled (no DSN, or feature not compiled in). fn init_sentry() -> Option { @@ -139,7 +170,19 @@ fn init_sentry() -> Option { traces_sample_rate: sample_rate, attach_stacktrace: true, send_default_pii: false, + // Bumped from the SDK default (30) so that a slow-burn + // failure — where the eventual error is preceded by a + // long tail of tool_call info breadcrumbs — still shows + // the breadcrumbs that matter. The breadcrumb buffer is + // per-scope so there's no global memory concern. + max_breadcrumbs: 100, + // Sentry's Logs product — when enabled, structured + // tracing fields ride along on every event so we can + // search/filter in Sentry's log explorer too, not just + // in the issue grouping. + enable_logs: true, debug: sentry_debug, + before_send: Some(std::sync::Arc::new(before_send)), ..Default::default() }, )); @@ -156,3 +199,67 @@ fn init_sentry() -> Option { None } } + +#[cfg(all(test, feature = "sentry"))] +mod tests { + use super::*; + use sentry::protocol::Event; + + fn event_with_class(class: &str) -> Event<'static> { + let mut ev = Event::default(); + ev.tags + .insert("upstream_error_class".to_string(), class.to_string()); + ev + } + + #[test] + fn before_send_drops_quota_class_events() { + // 402 path — `report_upstream_error` classifies as `quota` and + // emits at Warning. Without before_send, the user reported these + // were spamming the issue list and burning Sentry quota. + let ev = event_with_class("quota"); + assert!(before_send(ev).is_none()); + } + + #[test] + fn before_send_drops_rate_limited_class_events() { + // 429 path — same noise problem. Breadcrumbs still record the + // context for the next real error; this just stops the + // standalone events from shipping. + let ev = event_with_class("rate_limited"); + assert!(before_send(ev).is_none()); + } + + #[test] + fn before_send_passes_bad_input_through() { + // 400 / 422 is a real bug we want to see. + let ev = event_with_class("bad_input"); + assert!(before_send(ev).is_some()); + } + + #[test] + fn before_send_passes_server_error_through() { + let ev = event_with_class("server_error"); + assert!(before_send(ev).is_some()); + } + + #[test] + fn before_send_passes_auth_error_through() { + let ev = event_with_class("auth_error"); + assert!(before_send(ev).is_some()); + } + + #[test] + fn before_send_passes_transport_error_through() { + let ev = event_with_class("transport_error"); + assert!(before_send(ev).is_some()); + } + + #[test] + fn before_send_passes_events_without_class_tag() { + // Panics, generic tracing::error! from elsewhere — no class tag. + // Must not be dropped. + let ev = Event::default(); + assert!(before_send(ev).is_some()); + } +} diff --git a/src/core/sentry_scope.rs b/src/core/sentry_scope.rs index 3f62508..0d44010 100644 --- a/src/core/sentry_scope.rs +++ b/src/core/sentry_scope.rs @@ -1,22 +1,129 @@ //! Sentry scope helpers for proxy-side upstream error classification. //! -//! Adds structured tags and a per-{provider, operation_id, upstream_status} -//! fingerprint so each root-cause bucket becomes a distinct Sentry issue -//! instead of one "ati command failed" mega-bucket. Also routes log level -//! by status class (info/warn/error). +//! Adds structured tags + a class-based dynamic event title so each error +//! class becomes a distinct Sentry issue bucket instead of every failure +//! collapsing into one generic "upstream server error" bucket. Also routes +//! log level by status class (info/warn/error). //! -//! See issue #81 for context. +//! Title strategy: Sentry's tracing bridge maps `tracing::error!` messages +//! to event titles, and Sentry groups events by title when no fingerprint +//! is set. We dispatch the report into 5 buckets — `auth_error`, +//! `bad_input`, `rate_limited`, `server_error`, `transport_error` — each +//! with its own literal-string `tracing::error!`. Provider/operation/ +//! status live as tags so each bucket is filterable per-provider without +//! splitting the buckets per-provider. +//! +//! See issue #81 for the original context; the redesign closes the +//! "Sentry errors really fucking suck" report — generic titles, missing +//! source chains, lying tags. + +use crate::core::jwt::TokenClaims; + +/// Coarse classification of an upstream failure. Each class maps to a +/// distinct Sentry bucket via a unique literal `tracing::error!` / +/// `tracing::warn!` message — that's how grouping survives across +/// providers without collapsing every error into one mega-bucket. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum UpstreamErrorClass { + /// 401 / 403 / 407 — credentials missing, revoked, or insufficient. + AuthError, + /// 400 / 404 (non-no-records) / 422 — request shape was wrong. + BadInput, + /// 402 — out of credit; not actionable code-side. Always Warning. + Quota, + /// 429 — rate limited. Often actionable as a retry; Warning level. + RateLimited, + /// 5xx — upstream service broke. + ServerError, + /// 0 — no HTTP status (DNS, TCP, TLS, MCP transport error, etc.). + TransportError, +} + +impl UpstreamErrorClass { + pub fn classify(upstream_status: u16) -> Self { + match upstream_status { + 401 | 403 | 407 => Self::AuthError, + 400 | 404 | 422 => Self::BadInput, + 402 => Self::Quota, + 429 => Self::RateLimited, + 500..=599 => Self::ServerError, + _ => Self::TransportError, + } + } + + /// Short, stable label used in Sentry tags + event extras. + pub fn as_tag(self) -> &'static str { + match self { + Self::AuthError => "auth_error", + Self::BadInput => "bad_input", + Self::Quota => "quota", + Self::RateLimited => "rate_limited", + Self::ServerError => "server_error", + Self::TransportError => "transport_error", + } + } + + /// True for classes that should NOT page on-call. The Sentry + /// `before_send` hook drops these silently to keep the issue list + /// useful — they still appear as breadcrumbs inside the next real + /// event so context isn't lost. + pub fn is_quota_class(self) -> bool { + matches!(self, Self::Quota | Self::RateLimited) + } +} + +/// Split a proxy tool reference into `(provider, operation_id)`. +/// +/// Three input shapes are common in the wild: +/// - `"finnhub:price_target"` — explicit `provider:op` (recent OpenAPI tools). +/// - `"pdl_person_enrichment"` — flat, no separator, prefixed with +/// provider name (PDL, datalastic, several others). +/// - `"web_search"` — flat, no separator, no prefix (`parallel` MCP-ish). +/// +/// We always know the resolved `provider_name` at the call site because +/// the registry told us. Use that as the source of truth, then derive +/// the operation by stripping a `{provider}:` or `{provider}_` prefix +/// when present. Never fabricate "unknown" — if we can't split cleanly, +/// the operation tag IS the tool_name verbatim (still useful for grouping +/// and search). The old "unknown" sentinel made every flat-named tool +/// land in one mega-bucket. +pub fn provider_and_op(provider_name: &str, tool_name: &str) -> (String, String) { + // Explicit `provider:op` wins regardless of provider_name (handles the + // colon-namespaced OpenAPI naming). + if let Some((p, op)) = tool_name.split_once(crate::core::manifest::TOOL_SEP) { + if !p.is_empty() && !op.is_empty() { + return (p.to_string(), op.to_string()); + } + } + // Flat name — try to strip the provider as a colon-or-underscore prefix. + // Underscore is the convention PDL uses (`pdl_person_enrichment` → + // operation = `person_enrichment`). + if let Some(rest) = tool_name + .strip_prefix(&format!("{provider_name}:")) + .or_else(|| tool_name.strip_prefix(&format!("{provider_name}_"))) + { + if !rest.is_empty() { + return (provider_name.to_string(), rest.to_string()); + } + } + // No prefix — the tool_name IS the operation under this provider. + (provider_name.to_string(), tool_name.to_string()) +} -/// Split a proxy tool_name (`"provider:operation_id"`) into its parts. -/// Tool names missing a separator are treated as having an unknown operation. +/// Back-compat shim for call sites that still don't know the resolved +/// provider name. New code should call `provider_and_op` directly. +/// +/// Falls back to the old colon-only split for `provider:op` style names. +/// For flat tool names with no separator, returns the tool name as the +/// operation (NOT "unknown" — never lie about what failed) and an empty +/// provider, which the caller is expected to overwrite. pub fn split_tool_name(tool_name: &str) -> (String, String) { - match tool_name.split_once(crate::core::manifest::TOOL_SEP) { - Some((p, op)) if !p.is_empty() && !op.is_empty() => (p.to_string(), op.to_string()), - // Preserve the bare provider prefix (no trailing colon) when op is - // missing or empty, so Sentry tags stay clean. - Some((p, _)) if !p.is_empty() => (p.to_string(), "unknown".to_string()), - _ => (tool_name.to_string(), "unknown".to_string()), + if let Some((p, op)) = tool_name.split_once(crate::core::manifest::TOOL_SEP) { + if !p.is_empty() && !op.is_empty() { + return (p.to_string(), op.to_string()); + } } + (String::new(), tool_name.to_string()) } /// Scrub obvious PII patterns (UUIDs, emails, IPv4s, long hex tokens) from a @@ -305,13 +412,22 @@ pub fn is_no_records_body(error_type: Option<&str>, error_message: Option<&str>) false } -/// Attach structured tags + fingerprint to the current Sentry scope and emit a -/// tracing event at the appropriate level for the given upstream status class. +/// Attach structured tags + class-based dynamic title to the current Sentry +/// scope and emit a tracing event at the appropriate level for the upstream +/// status class. +/// +/// Buckets (event title literal): +/// - `auth_error` → 401, 403, 407 (Error) +/// - `bad_input` → 400, 404 (non-no-records), 422 (Error) +/// - `quota` → 402 (Warning; dropped by `before_send`) +/// - `rate_limited` → 429 (Warning; dropped by `before_send`) +/// - `server_error` → 5xx (Error) +/// - `transport_error` → 0 / unknown (Error) /// -/// Levels: -/// 402 / 403 / 422 → warn (expected client-side upstream error, Sentry event -/// at warning level for filtering, does not page) -/// all others → error (includes 5xx, network failures, unknown) +/// Each title is a distinct literal so `sentry-tracing` creates a separate +/// Sentry issue per class. The (provider, operation_id, status) live as +/// tags so each bucket stays per-provider searchable without splitting +/// into one bucket per provider. /// /// When the `sentry` feature is off, emits the tracing event only. pub fn report_upstream_error( @@ -325,12 +441,15 @@ pub fn report_upstream_error( let msg_short = error_message .map(|m| scrub_and_truncate(m, 140)) .unwrap_or_default(); + let class = UpstreamErrorClass::classify(upstream_status); // `sentry::with_scope` pushes a temporary scope for the duration of the // closure, then pops it — so tags never leak across requests running on // the same tokio worker thread. The tracing macros inside the closure - // are picked up by `sentry_tracing::layer()` and emitted with these tags - // attached. + // are picked up by `sentry_tracing::layer()` and emitted with these + // tags attached. `class` becomes its own tag so operators can filter + // a single bucket by error shape without grouping each subclass into + // its own issue. with_upstream_scope( provider, operation_id, @@ -338,40 +457,227 @@ pub fn report_upstream_error( proxy_status, error_type, &msg_short, - || match upstream_status { - 402 | 403 | 422 => { - tracing::warn!( - provider, - operation_id, - upstream_status, - proxy_status, - error_type = error_type.unwrap_or(""), - msg = %msg_short, - "upstream client error" - ); - // sentry-tracing maps warn → breadcrumb by default. We want an - // actual event for warn-tier upstream errors so operators can - // search by tag — capture explicitly at Warning level. - #[cfg(feature = "sentry")] - sentry::capture_message( - &format!("upstream client error ({upstream_status}) {provider}:{operation_id}"), - sentry::Level::Warning, - ); - } - _ => tracing::error!( + class, + || { + emit_classified( + class, provider, operation_id, upstream_status, proxy_status, - error_type = error_type.unwrap_or(""), - msg = %msg_short, - "upstream server error" - ), + error_type, + &msg_short, + ) }, ); } +/// Each arm carries its own LITERAL `tracing::*!` message because the +/// `sentry-tracing` bridge maps the literal to the Sentry event title, +/// and Sentry groups events by title. Five literals → five distinct +/// Sentry issue buckets. All the variable detail lives in fields. +fn emit_classified( + class: UpstreamErrorClass, + provider: &str, + operation_id: &str, + upstream_status: u16, + proxy_status: u16, + error_type: Option<&str>, + msg_short: &str, +) { + let error_type = error_type.unwrap_or(""); + match class { + UpstreamErrorClass::AuthError => tracing::error!( + provider, + operation_id, + upstream_status, + proxy_status, + class = class.as_tag(), + error_type, + msg = %msg_short, + "upstream auth_error" + ), + UpstreamErrorClass::BadInput => tracing::error!( + provider, + operation_id, + upstream_status, + proxy_status, + class = class.as_tag(), + error_type, + msg = %msg_short, + "upstream bad_input" + ), + UpstreamErrorClass::Quota => tracing::warn!( + provider, + operation_id, + upstream_status, + proxy_status, + class = class.as_tag(), + error_type, + msg = %msg_short, + "upstream quota" + ), + UpstreamErrorClass::RateLimited => tracing::warn!( + provider, + operation_id, + upstream_status, + proxy_status, + class = class.as_tag(), + error_type, + msg = %msg_short, + "upstream rate_limited" + ), + UpstreamErrorClass::ServerError => tracing::error!( + provider, + operation_id, + upstream_status, + proxy_status, + class = class.as_tag(), + error_type, + msg = %msg_short, + "upstream server_error" + ), + UpstreamErrorClass::TransportError => tracing::error!( + provider, + operation_id, + upstream_status, + proxy_status, + class = class.as_tag(), + error_type, + msg = %msg_short, + "upstream transport_error" + ), + } +} + +/// Attach the structured Rust error (with its full `source()` chain) to +/// the current Sentry scope and capture it as an exception event. This +/// is what gives the issue page a real "Exception" block with the +/// reqwest / `HttpError` / `McpError` source chain — without this, all +/// Sentry sees is the tracing-bridge's flat title + tags. +/// +/// Pair with `report_upstream_error`: that one creates the +/// titled/grouped issue, this one attaches the structured context. +/// When the `sentry` feature is off, this is a no-op. +#[allow(unused_variables)] +pub fn capture_error_with_scope( + err: &(dyn std::error::Error + 'static), + provider: &str, + operation_id: &str, + upstream_status: u16, + proxy_status: u16, + error_type: Option<&str>, + error_message: Option<&str>, +) { + #[cfg(feature = "sentry")] + { + let msg_short = error_message + .map(|m| scrub_and_truncate(m, 140)) + .unwrap_or_default(); + let class = UpstreamErrorClass::classify(upstream_status); + // Skip quota / rate-limit classes — they're already dropped by + // before_send and we don't want to spend the quota encoding the + // backtrace just to throw it away. + if class.is_quota_class() { + return; + } + // Build the exception event manually so we can pin its `message` + // to the same class literal the tracing-bridge event uses. + // + // Why this matters: `report_upstream_error` fires a tracing event + // whose title is the class literal (e.g. `"upstream bad_input"`), + // and Sentry groups it by that title under the shared fingerprint. + // If we let `sentry::capture_error` build the second event with + // its default behavior, Sentry sets the new event's title from + // `exception[0].value` (the underlying error's Display, like + // `"HttpError: ApiError { status: 400, … }"`). Sentry's "latest + // event wins for title" rule then drifts the operator-visible + // bucket title away from the clean class literal — the headline + // improvement the PR is built around (Greptile P2 on PR #137). + // + // Fix: set `Event.message` to the class literal up front. The + // exception chain is still attached, the fingerprint still + // groups, and the title stays stable. + let class_literal = match class { + UpstreamErrorClass::AuthError => "upstream auth_error", + UpstreamErrorClass::BadInput => "upstream bad_input", + UpstreamErrorClass::Quota => "upstream quota", + UpstreamErrorClass::RateLimited => "upstream rate_limited", + UpstreamErrorClass::ServerError => "upstream server_error", + UpstreamErrorClass::TransportError => "upstream transport_error", + }; + let mut event = sentry::event_from_error(err); + event.message = Some(class_literal.to_string()); + with_upstream_scope( + provider, + operation_id, + upstream_status, + proxy_status, + error_type, + &msg_short, + class, + || { + sentry::capture_event(event); + }, + ); + } +} + +/// Attach JWT identity (sub, sandbox_id, job_id) to the current Sentry +/// scope so every event raised during this request is searchable by +/// those facets. Set once at the start of `/call`, `/mcp`, and `/help` +/// handlers. +/// +/// **Tags MUST be cleared, not just conditionally set.** `configure_scope` +/// mutates the thread-local hub scope, and tokio reuses worker threads +/// across requests. If request A had `sandbox_id = "alice"` and request +/// B on the same worker thread has no `sandbox_id` in its JWT, a naive +/// `if let Some(id) = ... { set_tag(...) }` leaves "alice" stamped on +/// every event raised during request B — a multi-tenant data bleed +/// flagged by Greptile P1 on PR #137. The fix: unconditionally +/// `remove_tag` for every optional field at the top of the helper, then +/// set the ones we actually have. +/// +/// `sub` is the only required JWT claim (auth middleware would reject +/// the request otherwise), so we always set it. `user` is similarly +/// always present. +#[allow(unused_variables)] +pub fn set_jwt_sentry_scope(claims: &TokenClaims) { + #[cfg(feature = "sentry")] + sentry::configure_scope(|scope| { + // Clear optional tags from any prior request that ran on this + // worker thread before stamping the new request's identity. + // remove_tag is a no-op if the tag isn't present, so it's safe + // to call on cold workers too. + scope.remove_tag("sandbox_id"); + scope.remove_tag("job_id"); + + scope.set_tag("sub", claims.sub.as_str()); + if let Some(ref id) = claims.sandbox_id { + scope.set_tag("sandbox_id", id.as_str()); + } + if let Some(ref id) = claims.job_id { + scope.set_tag("job_id", id.as_str()); + } + // Sentry's `user.id` is the standard slot for the auth subject — + // populating it lets the issue page show "users affected" counts + // and gives the search UI a first-class facet. set_user(Some(..)) + // overwrites any prior value so no `set_user(None)` clear is + // needed (unlike the optional tags above which would silently + // bleed through). + scope.set_user(Some(sentry::User { + id: Some(claims.sub.clone()), + ..Default::default() + })); + }); +} + +// 8 parameters by design: each is a distinct Sentry tag value resolved +// at the call site. Bundling into a struct would just move the noise +// without removing it, and the helper is private. Same allow on the +// no-sentry stub below for signature parity. #[cfg(feature = "sentry")] +#[allow(clippy::too_many_arguments)] fn with_upstream_scope( provider: &str, operation_id: &str, @@ -379,16 +685,19 @@ fn with_upstream_scope( proxy_status: u16, error_type: Option<&str>, msg_short: &str, + class: UpstreamErrorClass, body: F, ) { let upstream_s = upstream_status.to_string(); let proxy_s = proxy_status.to_string(); + let class_tag = class.as_tag(); sentry::with_scope( |scope| { scope.set_tag("provider", provider); scope.set_tag("operation_id", operation_id); scope.set_tag("upstream_status", &upstream_s); scope.set_tag("proxy_status", &proxy_s); + scope.set_tag("upstream_error_class", class_tag); if let Some(t) = error_type { scope.set_tag("upstream_error_type", t); } @@ -398,12 +707,18 @@ fn with_upstream_scope( serde_json::Value::String(msg_short.to_string()), ); } + // Fingerprint keys on the error CLASS, not the upstream status, + // so a provider that returns 502 vs 504 doesn't fork into two + // buckets. Combined with the per-class literal title, this + // gives one issue per (provider, operation, class) — fine- + // grained enough to alert on a specific tool going bad but + // not so fine that every transient blip is a new issue. scope.set_fingerprint(Some( [ "ati.proxy.upstream_error", provider, operation_id, - &upstream_s, + class_tag, ] .as_slice(), )); @@ -413,6 +728,7 @@ fn with_upstream_scope( } #[cfg(not(feature = "sentry"))] +#[allow(clippy::too_many_arguments)] fn with_upstream_scope( _provider: &str, _operation_id: &str, @@ -420,6 +736,7 @@ fn with_upstream_scope( _proxy_status: u16, _error_type: Option<&str>, _msg_short: &str, + _class: UpstreamErrorClass, body: F, ) { body(); @@ -429,8 +746,62 @@ fn with_upstream_scope( mod tests { use super::*; + // --- provider_and_op: the path call sites should actually use --- + + #[test] + fn provider_and_op_explicit_colon() { + assert_eq!( + provider_and_op("finnhub", "finnhub:price_target"), + ("finnhub".into(), "price_target".into()) + ); + } + + #[test] + fn provider_and_op_strips_underscore_prefix() { + // The bug from the user's screenshot: PDL ships flat tool names + // like `pdl_person_enrichment`. The fix peels the provider as a + // `{provider}_` prefix when present. + assert_eq!( + provider_and_op("pdl", "pdl_person_enrichment"), + ("pdl".into(), "person_enrichment".into()) + ); + } + + #[test] + fn provider_and_op_strips_colon_prefix() { + assert_eq!( + provider_and_op("github", "github:search_repositories"), + ("github".into(), "search_repositories".into()) + ); + } + + #[test] + fn provider_and_op_no_prefix_keeps_tool_name() { + // No prefix to strip — operation is the tool name verbatim. This + // is the correct behavior; the OLD code returned "unknown" here, + // which clumped every prefixless-tool failure into one mega- + // bucket and forced the on-call to read the body to find out + // which tool failed. + assert_eq!( + provider_and_op("parallel", "web_search"), + ("parallel".into(), "web_search".into()) + ); + } + #[test] - fn split_tool_name_ok() { + fn provider_and_op_empty_op_after_strip_keeps_tool_name() { + // Edge case: `pdl_` — stripping leaves an empty operation, so we + // keep the tool_name as-is to avoid generating empty tags. + assert_eq!( + provider_and_op("pdl", "pdl_"), + ("pdl".into(), "pdl_".into()) + ); + } + + // --- split_tool_name back-compat shim (no longer fabricates "unknown") --- + + #[test] + fn split_tool_name_colon_form() { assert_eq!( split_tool_name("finnhub:price_target"), ("finnhub".into(), "price_target".into()) @@ -438,18 +809,103 @@ mod tests { } #[test] - fn split_tool_name_missing_op() { + fn split_tool_name_flat_returns_tool_name_as_op() { + // The OLD behavior was `("bare_tool", "unknown")` — that lied + // about what operation failed. New behavior keeps the tool name + // as the op; the caller is expected to overwrite the empty + // provider with the registry-resolved name. assert_eq!( split_tool_name("bare_tool"), - ("bare_tool".into(), "unknown".into()) + ("".into(), "bare_tool".into()) ); } + // --- UpstreamErrorClass: classification --- + #[test] - fn split_tool_name_empty_op() { + fn classify_400_is_bad_input() { + assert_eq!( + UpstreamErrorClass::classify(400), + UpstreamErrorClass::BadInput + ); + assert_eq!( + UpstreamErrorClass::classify(404), + UpstreamErrorClass::BadInput + ); assert_eq!( - split_tool_name("provider:"), - ("provider".into(), "unknown".into()) + UpstreamErrorClass::classify(422), + UpstreamErrorClass::BadInput + ); + } + + #[test] + fn classify_401_is_auth_error() { + assert_eq!( + UpstreamErrorClass::classify(401), + UpstreamErrorClass::AuthError + ); + assert_eq!( + UpstreamErrorClass::classify(403), + UpstreamErrorClass::AuthError + ); + assert_eq!( + UpstreamErrorClass::classify(407), + UpstreamErrorClass::AuthError + ); + } + + #[test] + fn classify_402_is_quota() { + assert_eq!(UpstreamErrorClass::classify(402), UpstreamErrorClass::Quota); + assert!(UpstreamErrorClass::Quota.is_quota_class()); + } + + #[test] + fn classify_429_is_rate_limited() { + assert_eq!( + UpstreamErrorClass::classify(429), + UpstreamErrorClass::RateLimited + ); + assert!(UpstreamErrorClass::RateLimited.is_quota_class()); + } + + #[test] + fn classify_5xx_is_server_error() { + assert_eq!( + UpstreamErrorClass::classify(500), + UpstreamErrorClass::ServerError + ); + assert_eq!( + UpstreamErrorClass::classify(503), + UpstreamErrorClass::ServerError + ); + assert_eq!( + UpstreamErrorClass::classify(599), + UpstreamErrorClass::ServerError + ); + assert!(!UpstreamErrorClass::ServerError.is_quota_class()); + } + + #[test] + fn classify_zero_is_transport_error() { + // The proxy passes `upstream_status = 0` for MCP/CLI/transport + // failures with no HTTP exchange. Those need their own bucket. + assert_eq!( + UpstreamErrorClass::classify(0), + UpstreamErrorClass::TransportError + ); + } + + #[test] + fn class_tags_are_stable_strings() { + assert_eq!(UpstreamErrorClass::AuthError.as_tag(), "auth_error"); + assert_eq!(UpstreamErrorClass::BadInput.as_tag(), "bad_input"); + assert_eq!(UpstreamErrorClass::Quota.as_tag(), "quota"); + assert_eq!(UpstreamErrorClass::RateLimited.as_tag(), "rate_limited"); + assert_eq!(UpstreamErrorClass::ServerError.as_tag(), "server_error"); + assert_eq!( + UpstreamErrorClass::TransportError.as_tag(), + "transport_error" ); } @@ -608,4 +1064,214 @@ mod tests { fn truncate_short_message_untouched() { assert_eq!(scrub_and_truncate("short", 100), "short"); } + + // --- Greptile P2 on #137: title pinning on the capture_error path --- + + /// Mirrors what `capture_error_with_scope` does internally to build + /// the event it captures. Test surface for the title-pinning fix: + /// without setting `message`, the issue title would drift to the + /// raw error Display and overwrite the clean class literal that + /// `report_upstream_error` is built around. + #[cfg(feature = "sentry")] + fn build_captured_event_for_test( + err: &(dyn std::error::Error + 'static), + class: UpstreamErrorClass, + ) -> sentry::protocol::Event<'static> { + let class_literal = match class { + UpstreamErrorClass::AuthError => "upstream auth_error", + UpstreamErrorClass::BadInput => "upstream bad_input", + UpstreamErrorClass::Quota => "upstream quota", + UpstreamErrorClass::RateLimited => "upstream rate_limited", + UpstreamErrorClass::ServerError => "upstream server_error", + UpstreamErrorClass::TransportError => "upstream transport_error", + }; + let mut event = sentry::event_from_error(err); + event.message = Some(class_literal.to_string()); + event + } + + #[cfg(feature = "sentry")] + #[test] + fn captured_event_message_pins_class_literal_not_error_display() { + // The user-visible bug Greptile flagged: `sentry::event_from_error` + // leaves `Event.message = None`, and Sentry's UI falls back to + // the underlying error's Display for the issue title. Two events + // share the same fingerprint, but the second one's Display + // clobbers the first's clean class literal in the title bar. + // + // Fix shape: pin `Event.message` to the class literal BEFORE + // capturing the event. The exception block is unchanged, so the + // chain still ships; only the title is pinned. + let inner = std::io::Error::other("operation timed out"); + let event = build_captured_event_for_test(&inner, UpstreamErrorClass::BadInput); + assert_eq!(event.message.as_deref(), Some("upstream bad_input")); + // Sanity: the exception chain is still there (the whole point of + // calling `capture_error` in the first place). + assert!( + !event.exception.is_empty(), + "expected the exception block to be populated by event_from_error" + ); + // And the inner error's Display is preserved in the exception + // value, just not bleeding into the title field. + let exc = event + .exception + .values + .first() + .expect("at least one exception"); + assert_eq!(exc.value.as_deref(), Some("operation timed out")); + } + + // --- Greptile P1 on #137: set_jwt_sentry_scope tag-clear regression --- + // + // Async runtime workers are reused across requests, and + // `sentry::configure_scope` mutates the per-thread hub permanently. + // Without an explicit `remove_tag`, a prior request's `sandbox_id` / + // `job_id` would bleed into the next request on the same worker + // thread — a multi-tenant data leak. These tests use sentry's test + // transport to inspect actual emitted events. + #[cfg(feature = "sentry")] + #[test] + fn set_jwt_sentry_scope_clears_stale_sandbox_id_between_requests() { + let events = sentry::test::with_captured_events(|| { + // Request A: full identity. + let claims_a = TokenClaims { + iss: None, + sub: "agent_alice".into(), + aud: "ati-proxy".into(), + iat: 0, + exp: 0, + jti: None, + scope: String::new(), + ati: None, + sandbox_id: Some("sandbox_alpha".into()), + job_id: Some("job_42".into()), + }; + set_jwt_sentry_scope(&claims_a); + sentry::capture_message("during request A", sentry::Level::Error); + + // Request B on the same worker thread: claims have no + // sandbox_id and no job_id. The stale values from request + // A MUST NOT bleed onto request B's events. + let claims_b = TokenClaims { + iss: None, + sub: "agent_bob".into(), + aud: "ati-proxy".into(), + iat: 0, + exp: 0, + jti: None, + scope: String::new(), + ati: None, + sandbox_id: None, + job_id: None, + }; + set_jwt_sentry_scope(&claims_b); + sentry::capture_message("during request B", sentry::Level::Error); + }); + + assert_eq!(events.len(), 2, "expected two captured events"); + let a = events + .iter() + .find(|e| e.message.as_deref() == Some("during request A")) + .expect("event A"); + let b = events + .iter() + .find(|e| e.message.as_deref() == Some("during request B")) + .expect("event B"); + + // Request A's event has the full identity. + assert_eq!(a.tags.get("sub").map(String::as_str), Some("agent_alice")); + assert_eq!( + a.tags.get("sandbox_id").map(String::as_str), + Some("sandbox_alpha") + ); + assert_eq!(a.tags.get("job_id").map(String::as_str), Some("job_42")); + + // Request B's event has ONLY the sub it carried. No bleed. + assert_eq!(b.tags.get("sub").map(String::as_str), Some("agent_bob")); + assert!( + !b.tags.contains_key("sandbox_id"), + "sandbox_id from prior request bled through: {:?}", + b.tags.get("sandbox_id") + ); + assert!( + !b.tags.contains_key("job_id"), + "job_id from prior request bled through: {:?}", + b.tags.get("job_id") + ); + } + + #[cfg(feature = "sentry")] + #[test] + fn set_jwt_sentry_scope_replaces_user_between_requests() { + // `set_user(Some(..))` always overwrites the prior value, so a + // dedicated `set_user(None)` clear isn't needed. This test pins + // that invariant so future refactors don't silently break it. + fn claims_for(sub: &str) -> TokenClaims { + TokenClaims { + iss: None, + sub: sub.into(), + aud: "ati-proxy".into(), + iat: 0, + exp: 0, + jti: None, + scope: String::new(), + ati: None, + sandbox_id: None, + job_id: None, + } + } + let events = sentry::test::with_captured_events(|| { + set_jwt_sentry_scope(&claims_for("agent_alice")); + sentry::capture_message("A", sentry::Level::Error); + + set_jwt_sentry_scope(&claims_for("agent_bob")); + sentry::capture_message("B", sentry::Level::Error); + }); + + let a = events + .iter() + .find(|e| e.message.as_deref() == Some("A")) + .unwrap(); + let b = events + .iter() + .find(|e| e.message.as_deref() == Some("B")) + .unwrap(); + assert_eq!( + a.user.as_ref().and_then(|u| u.id.as_deref()), + Some("agent_alice") + ); + assert_eq!( + b.user.as_ref().and_then(|u| u.id.as_deref()), + Some("agent_bob"), + "Request B's user.id should be its own sub, not Alice's stale value" + ); + } + + #[cfg(feature = "sentry")] + #[test] + fn captured_event_message_pins_every_class() { + // Lock in the class → literal mapping so future renames of the + // tracing-event literals don't silently desync with the + // capture-error path. + let err = std::io::Error::other("x"); + let cases = [ + (UpstreamErrorClass::AuthError, "upstream auth_error"), + (UpstreamErrorClass::BadInput, "upstream bad_input"), + (UpstreamErrorClass::Quota, "upstream quota"), + (UpstreamErrorClass::RateLimited, "upstream rate_limited"), + (UpstreamErrorClass::ServerError, "upstream server_error"), + ( + UpstreamErrorClass::TransportError, + "upstream transport_error", + ), + ]; + for (class, expected_title) in cases { + let event = build_captured_event_for_test(&err, class); + assert_eq!( + event.message.as_deref(), + Some(expected_title), + "class {class:?} should pin title to {expected_title:?}" + ); + } + } } diff --git a/src/proxy/server.rs b/src/proxy/server.rs index 2bd2ba6..dcf8b3c 100644 --- a/src/proxy/server.rs +++ b/src/proxy/server.rs @@ -627,6 +627,15 @@ async fn handle_call( ) -> impl IntoResponse { // Extract JWT claims from request extensions (set by auth middleware) let claims = req.extensions().get::().cloned(); + // Stamp Sentry's scope with the request identity so every event raised + // during this request — exceptions, the dynamic-title upstream-error + // bucket, panics — is searchable by sub / sandbox_id / job_id. The + // scope is per-thread and gets cleared by subsequent `with_scope` + // pushes inside the report helpers; the tags we set here ride along + // as the bottom layer. + if let Some(ref c) = claims { + sentry_scope::set_jwt_sentry_scope(c); + } // Raw inbound bearer (Bearer-token path only — dev/no-JWT paths leave // this empty). Used by `GenContext.jwt_token` so auth_generator scripts // can forward the sandbox's identity to an upstream MCP. See issue #115. @@ -856,15 +865,32 @@ async fn handle_call( }), ), Err(e) => { + // Use the registry-resolved provider name (not the + // tool_name) — the old `split_tool_name` lied when the + // tool name had no `:` separator. The error_message + // path also gets the underlying error's `to_string` + // for the Sentry "extra"; the full source chain is + // attached separately via `capture_error_with_scope` + // so the Sentry issue page shows real exception data. let (provider_name, operation_id) = - sentry_scope::split_tool_name(&call_req.tool_name); + sentry_scope::provider_and_op(&provider.name, &call_req.tool_name); + let msg = e.to_string(); sentry_scope::report_upstream_error( &provider_name, &operation_id, 0, 502, None, - Some(&e.to_string()), + Some(&msg), + ); + sentry_scope::capture_error_with_scope( + &e, + &provider_name, + &operation_id, + 0, + 502, + None, + Some(&msg), ); ( StatusCode::BAD_GATEWAY, @@ -895,15 +921,29 @@ async fn handle_call( }), ), Err(e) => { + // Same registry-resolved provider treatment as the + // MCP arm — and the structured error attached via + // `capture_error_with_scope` so Sentry sees the real + // exception chain instead of just the flat title. let (provider_name, operation_id) = - sentry_scope::split_tool_name(&call_req.tool_name); + sentry_scope::provider_and_op(&provider.name, &call_req.tool_name); + let msg = e.to_string(); sentry_scope::report_upstream_error( &provider_name, &operation_id, 0, 502, None, - Some(&e.to_string()), + Some(&msg), + ); + sentry_scope::capture_error_with_scope( + &e, + &provider_name, + &operation_id, + 0, + 502, + None, + Some(&msg), ); ( StatusCode::BAD_GATEWAY, @@ -970,8 +1010,13 @@ async fn handle_call( } Err(e) => { let duration = start.elapsed(); + // Use the registry-resolved provider name so the + // Sentry tags actually identify the provider — the + // old `split_tool_name` returned the whole flat tool + // name as the "provider" and "unknown" as the + // operation for tools like `pdl_person_enrichment`. let (provider_name, operation_id) = - sentry_scope::split_tool_name(&call_req.tool_name); + sentry_scope::provider_and_op(&provider.name, &call_req.tool_name); let (upstream_status, error_type, error_message) = match &e { http::HttpError::ApiError { status, @@ -989,6 +1034,20 @@ async fn handle_call( error_type.as_deref(), error_message.as_deref(), ); + // Attach the structured error (with source chain) as + // a separate exception event so the Sentry issue + // page shows real Rust frames + Display, not just + // the scrubbed body. Quota / rate-limited classes + // short-circuit inside the helper. + sentry_scope::capture_error_with_scope( + &e, + &provider_name, + &operation_id, + upstream_status, + 502, + error_type.as_deref(), + error_message.as_deref(), + ); write_proxy_audit( &call_req, &agent_sub,