From e5291d35914f26a835f5e020465200bddfc4e12a Mon Sep 17 00:00:00 2001 From: oktofeesh1 <287075021+oktofeesh1@users.noreply.github.com> Date: Tue, 30 Jun 2026 17:24:19 -0700 Subject: [PATCH] feat(selfhost): trace review pipeline in Sentry --- .env.example | 16 +- .../routes/docs.self-hosting-operations.tsx | 7 + package-lock.json | 1 + package.json | 1 + src/queue/processors.ts | 486 +++++++++++------- src/selfhost/otel.ts | 107 +++- src/selfhost/pg-queue.ts | 2 +- src/selfhost/review-tracing.ts | 64 +++ src/selfhost/sentry.ts | 179 ++++--- src/selfhost/sqlite-queue.ts | 2 +- src/selfhost/tracing.ts | 10 +- src/server.ts | 13 +- test/unit/queue.test.ts | 10 + test/unit/selfhost-otel.test.ts | 275 ++++++++++ test/unit/selfhost-sentry.test.ts | 261 ++++++---- test/unit/selfhost-tracing.test.ts | 16 +- 16 files changed, 1054 insertions(+), 396 deletions(-) create mode 100644 src/selfhost/review-tracing.ts diff --git a/.env.example b/.env.example index 4948613fb..d778b6886 100644 --- a/.env.example +++ b/.env.example @@ -238,13 +238,13 @@ REDIS_URL=redis://redis:6379 # REQUIRED for the self-host review # --- Sentry error tracking (optional) --- # SENTRY_DSN= # enables self-host Sentry capture; unset = complete no-op # SENTRY_ENVIRONMENT=production -# SENTRY_TRACES_SAMPLE_RATE=0 # traces/spans are off by default; errors still report. Set a LOW rate -# # (e.g. 0.05) with SENTRY_DSN to sample review tracing: each sampled -# # review emits a connected trace — the queue-job span (whole-review -# # latency) with the AI-provider span nested — so you can filter slow -# # or failed STAGES in Sentry without reading scattered logs. Spans -# # carry only safe dimensions (repo, job type, provider/model); never -# # prompts, diffs, tokens, or bodies. Leave 0 to keep tracing a no-op. +# SENTRY_TRACES_SAMPLE_RATE= # traces/spans are off when blank/unset; errors still report. Set a LOW +# # rate (e.g. 0.05) with SENTRY_DSN to sample review tracing: each +# # sampled review emits a connected trace — the queue-job span +# # (whole-review latency) with the AI-provider span nested — so you can +# # filter slow or failed STAGES in Sentry without reading scattered logs. +# # Spans carry only safe dimensions (repo, job type, provider/model); +# # never prompts, diffs, tokens, or bodies. # SENTRY_RELEASE= # custom images only: set this ONLY when you uploaded source maps for # # the exact built bundle under this exact release id. Future official # # images bake GITTENSORY_VERSION=gittensory-selfhost@, so do @@ -271,7 +271,7 @@ REDIS_URL=redis://redis:6379 # REQUIRED for the self-host review # SENTRY_SERVER_NAME= # clean name for THIS instance in Sentry (e.g. gittensory-us-east); # # unset defaults to the OS hostname — never the public-origin URL # SENTRY_RELEASE= -# SENTRY_TRACES_SAMPLE_RATE=0 +# SENTRY_TRACES_SAMPLE_RATE= # --- AI review backend (optional; without AI_PROVIDER reviews run deterministically) --- # AI_SUMMARIES_ENABLED=true diff --git a/apps/gittensory-ui/src/routes/docs.self-hosting-operations.tsx b/apps/gittensory-ui/src/routes/docs.self-hosting-operations.tsx index 0711d8cfc..b1ac9f840 100644 --- a/apps/gittensory-ui/src/routes/docs.self-hosting-operations.tsx +++ b/apps/gittensory-ui/src/routes/docs.self-hosting-operations.tsx @@ -85,6 +85,13 @@ review_context_fetch_failed`}

+

Sentry tracing

+

+ Leave SENTRY_TRACES_SAMPLE_RATE unset or blank to disable trace export, or set + a positive sample rate such as 0.05 to send sampled review spans to Sentry. The + custom OpenTelemetry provider installs Sentry hooks for review-stage spans carrying repo, + PR, operation, outcome, and hashed installation tags. +

Sentry cron monitors

When SENTRY_DSN is set, the self-host runtime emits Sentry monitor check-ins diff --git a/package-lock.json b/package-lock.json index 91eed51e6..a8e4c44f7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -23,6 +23,7 @@ "@opentelemetry/resources": "^2.8.0", "@opentelemetry/sdk-trace-node": "^2.8.0", "@sentry/node": "^10.62.0", + "@sentry/opentelemetry": "^10.62.0", "agents": "^0.16.2", "drizzle-orm": "^0.45.2", "hono": "^4.12.26", diff --git a/package.json b/package.json index 547f50e6d..5b707e914 100644 --- a/package.json +++ b/package.json @@ -77,6 +77,7 @@ "@opentelemetry/resources": "^2.8.0", "@opentelemetry/sdk-trace-node": "^2.8.0", "@sentry/node": "^10.62.0", + "@sentry/opentelemetry": "^10.62.0", "agents": "^0.16.2", "drizzle-orm": "^0.45.2", "hono": "^4.12.26", diff --git a/src/queue/processors.ts b/src/queue/processors.ts index d2c67eba3..f2dd08e1d 100644 --- a/src/queue/processors.ts +++ b/src/queue/processors.ts @@ -355,6 +355,10 @@ import { resolveEnrichmentLinkedIssueNumbers, } from "../review/enrichment-wire"; import { captureReviewFailure } from "../selfhost/sentry"; +import { + setReviewPipelineSpanOutcome, + withReviewPipelineSpan, +} from "../selfhost/review-tracing"; import { evaluateWithSurfaceLane } from "../review/content-lane-wire"; import { reviewThreadBlockerFinding } from "../review/review-thread-findings"; import { indexRepo, reindexChangedPaths } from "../review/rag-index"; @@ -1826,14 +1830,24 @@ async function reReviewStoredPullRequest( // (#audit-rate-headroom): seed the request-local facts from the resync payload, then share them with the // readiness check, public surface, and auto-maintain planner. if ( - !(await prReadyForReview( - env, - installationId, - repoFullName, - pr, - settings, - deliveryId, - liveFacts, + !(await withReviewPipelineSpan( + "selfhost.review.readiness", + { + installationId, + repoFullName, + pullNumber: pr.number, + operation: "readiness", + }, + () => + prReadyForReview( + env, + installationId, + repoFullName, + pr, + settings, + deliveryId, + liveFacts, + ), )) ) return; @@ -1873,21 +1887,31 @@ async function reReviewStoredPullRequest( () => undefined, ); } - const gate = await maybePublishPrPublicSurface( - env, - installationId, - repoFullName, - pr, - repo, - settings, - advisory, + const gate = await withReviewPipelineSpan( + "selfhost.review.public_surface", { - deliveryId, - baseSha: live?.base?.sha ?? null, - liveFacts, - ...(previewPollAttempt !== undefined ? { previewPollAttempt } : {}), - ...(options.skipAiReview ? { skipAiReview: true } : {}), + installationId, + repoFullName, + pullNumber: pr.number, + operation: "public_surface", }, + () => + maybePublishPrPublicSurface( + env, + installationId, + repoFullName, + pr, + repo, + settings, + advisory, + { + deliveryId, + baseSha: live?.base?.sha ?? null, + liveFacts, + ...(previewPollAttempt !== undefined ? { previewPollAttempt } : {}), + ...(options.skipAiReview ? { skipAiReview: true } : {}), + }, + ), ).catch((error) => { /* v8 ignore next -- retryable/rate-limit propagation is exercised by queue retry tests; this catch only preserves that contract. */ if (isGitHubRateLimitedError(error) || isRetryableJobError(error)) throw error; @@ -1903,17 +1927,28 @@ async function reReviewStoredPullRequest( ); return undefined; }); - await maybeRunAgentMaintenance(env, { - installationId, - repoFullName, - repo, - pr, - settings, - otherOpenPullRequests, - deliveryId, - gate, - liveFacts, - }).catch((error) => { + await withReviewPipelineSpan( + "selfhost.review.maintenance", + { + installationId, + repoFullName, + pullNumber: pr.number, + operation: "maintenance", + decisionOutcome: gate?.conclusion, + }, + () => + maybeRunAgentMaintenance(env, { + installationId, + repoFullName, + repo, + pr, + settings, + otherOpenPullRequests, + deliveryId, + gate, + liveFacts, + }), + ).catch((error) => { console.error( JSON.stringify({ level: "warn", @@ -3148,6 +3183,7 @@ async function processGitHubWebhook( if (payload.repository?.full_name && payload.pull_request) { const repoFullName = payload.repository.full_name; + const payloadPullRequest = payload.pull_request; // Accuracy/eval feedback loop (#self-improve / GAP-4). Independent of the review path + best-effort: // • pr_outcome — on `closed`, record the REALIZED merge-vs-close ground truth so computeGateEval can // score the gate's prediction against what the human actually did. @@ -3353,31 +3389,51 @@ async function processGitHubWebhook( | undefined; const liveFacts = createLiveGithubFacts(); if ( - await prReadyForReview( - env, - installationId, - repoFullName, - pr, - settings, - deliveryId, - liveFacts, + await withReviewPipelineSpan( + "selfhost.review.readiness", + { + installationId, + repoFullName, + pullNumber: pr.number, + operation: "readiness", + }, + () => + prReadyForReview( + env, + installationId, + repoFullName, + pr, + settings, + deliveryId, + liveFacts, + ), ) ) { - gate = await maybePublishPrPublicSurface( - env, - installationId, - repoFullName, - pr, - repo, - settings, - advisory, + gate = await withReviewPipelineSpan( + "selfhost.review.public_surface", { - deliveryId, - authorType: payload.pull_request.user?.type, - action: payload.action, - baseSha: payload.pull_request.base?.sha ?? null, - liveFacts, + installationId, + repoFullName, + pullNumber: pr.number, + operation: "public_surface", }, + () => + maybePublishPrPublicSurface( + env, + installationId, + repoFullName, + pr, + repo, + settings, + advisory, + { + deliveryId, + authorType: payloadPullRequest.user?.type, + action: payload.action, + baseSha: payloadPullRequest.base?.sha ?? null, + liveFacts, + }, + ), ).catch((error) => { if (isGitHubRateLimitedError(error) || isRetryableJobError(error)) throw error; console.error( @@ -3395,17 +3451,28 @@ async function processGitHubWebhook( // #778 maintainer auto-maintain: act on the PR's state (label/review/merge/close) per the repo's // autonomy config, after the gate has run. The function self-guards on agent config; best-effort here // so it never blocks the gate or public surface. - await maybeRunAgentMaintenance(env, { - installationId, - repoFullName, - repo, - pr, - settings, - otherOpenPullRequests, - deliveryId, - gate, - liveFacts, - }).catch((error) => { + await withReviewPipelineSpan( + "selfhost.review.maintenance", + { + installationId, + repoFullName, + pullNumber: pr.number, + operation: "maintenance", + decisionOutcome: gate?.conclusion, + }, + () => + maybeRunAgentMaintenance(env, { + installationId, + repoFullName, + repo, + pr, + settings, + otherOpenPullRequests, + deliveryId, + gate, + liveFacts, + }), + ).catch((error) => { /* v8 ignore next -- best-effort: auto-maintain failures are logged, never surfaced to the gate. */ console.error( JSON.stringify({ @@ -3933,6 +4000,7 @@ export async function runAiReviewForAdvisory( args: { settings: RepositorySettings; advisory: Awaited>; + installationId?: number | null | undefined; repoFullName: string; pr: { number: number; @@ -4217,6 +4285,7 @@ export async function runAiReviewForAdvisory( captureReviewFailure(new Error("AI review inconclusive — no usable verdict for the PR head"), { kind: "review", reason: "ai_review_inconclusive", + installationId: args.installationId, owner: args.repoFullName.split("/")[0], repo: args.repoFullName, pr: args.pr.number, @@ -4285,6 +4354,7 @@ export async function runAiReviewForAdvisory( { kind: "review", reason: "ai_review_public_summary_missing", + installationId: args.installationId, owner: args.repoFullName.split("/")[0], repo: args.repoFullName, pr: args.pr.number, @@ -4320,6 +4390,7 @@ export async function runAiReviewForAdvisory( ); captureReviewFailure(error, { kind: "review", + installationId: args.installationId, repo: args.repoFullName, pr: args.pr.number, head_sha: args.advisory.headSha, @@ -5090,80 +5161,94 @@ async function maybePublishPrPublicSurface( } } if (aiReviewWillRun) { - // #1 self-host AI-review cache: the LLM output for a PR changes only when the code (head SHA) or the review - // mode changes, so reuse a prior review for this exact (repo, pr, head SHA, mode) — a re-delivered webhook or - // the block-mode ~2-min re-gate sweep (which re-runs the AI for every open PR) need not re-spend the call. On - // self-host there is no AI gateway, so this is the only AI cache. The deterministic gate below still runs. - const cachedReview = await getCachedAiReview( - env, - repoFullName, - pr.number, - advisory.headSha, - settings.aiReviewMode, - ).catch(() => null); - if (cachedReview && hasPublicReviewAssessment(cachedReview.notes)) { - advisory.findings.push(...cachedReview.findings); - aiReview = cachedReview; - } else { - // `.gittensory.yml` review.profile + review.path_instructions + review.exclude_paths (#review-profile / - // #review-path-instructions / #review-exclude-paths): resolve from the manifest (cached from settings - // resolution, so a cheap cache hit — no extra fetch) and thread them into the AI review. Profile shapes - // nitpickiness; path-instructions add per-path guidance; exclude-paths drop files from review. Absent ⇒ - // byte-identical prompt. Fail-safe to defaults on any read error (resolveReviewPromptOverrides). - const { - profile: reviewProfile, - inlineComments: reviewInlineComments, - pathInstructions: reviewPathInstructions, - instructions: manifestReviewInstructions, - excludePaths: reviewExcludePaths, - } = resolveReviewPromptOverrides( - await loadRepoFocusManifest(env, repoFullName).catch(() => null), - ); - inlineCommentsEnabledForReview = shouldRequestInlineFindings( - env, - repoFullName, - reviewInlineComments, - ); - // Per-repo review CONTEXT (#review-skills): fold the container-private review/AGENTS.md (or legacy - // review/CLAUDE.md) guide + the matching review/skills/*.md modules into the SAME review-instructions slot, - // so reviews follow each repo's conventions. - // Glob-gated for cost (only skills matching the changed files are injected); absent config dir ⇒ empty ⇒ - // byte-identical prompt. getReviewFiles() is memoized, so the second call reuses the loaded diff. - const reviewInstructions = - [ - manifestReviewInstructions, - composeRepoReviewContext( - await loadRepoReviewContext(repoFullName), - (await getReviewFiles()).map((file) => file.path), - ), - ] - .map((part) => part?.trim()) - .filter(Boolean) - .join("\n\n") || null; - aiReview = await runAiReviewForAdvisory(env, { - settings, - advisory, + await withReviewPipelineSpan( + "selfhost.review.ai", + { + installationId, repoFullName, - pr: { ...pr, baseSha: webhook.baseSha ?? null }, - author, - confirmedContributor, - files: await getReviewFiles(), - reviewProfile, - reviewPathInstructions, - reviewInstructions, - reviewExcludePaths, - reviewInlineComments, - }); - if (aiReview && aiReview.cacheable !== false) - await putCachedAiReview( + pullNumber: pr.number, + operation: "ai_review", + agent: "dual-ai", + }, + async () => { + // #1 self-host AI-review cache: the LLM output for a PR changes only when the code (head SHA) or the review + // mode changes, so reuse a prior review for this exact (repo, pr, head SHA, mode) — a re-delivered webhook or + // the block-mode ~2-min re-gate sweep (which re-runs the AI for every open PR) need not re-spend the call. On + // self-host there is no AI gateway, so this is the only AI cache. The deterministic gate below still runs. + const cachedReview = await getCachedAiReview( env, repoFullName, pr.number, advisory.headSha, settings.aiReviewMode, - aiReview, - ).catch(() => undefined); - } + ).catch(() => null); + if (cachedReview && hasPublicReviewAssessment(cachedReview.notes)) { + advisory.findings.push(...cachedReview.findings); + aiReview = cachedReview; + } else { + // `.gittensory.yml` review.profile + review.path_instructions + review.exclude_paths (#review-profile / + // #review-path-instructions / #review-exclude-paths): resolve from the manifest (cached from settings + // resolution, so a cheap cache hit — no extra fetch) and thread them into the AI review. Profile shapes + // nitpickiness; path-instructions add per-path guidance; exclude-paths drop files from review. Absent ⇒ + // byte-identical prompt. Fail-safe to defaults on any read error (resolveReviewPromptOverrides). + const { + profile: reviewProfile, + inlineComments: reviewInlineComments, + pathInstructions: reviewPathInstructions, + instructions: manifestReviewInstructions, + excludePaths: reviewExcludePaths, + } = resolveReviewPromptOverrides( + /* v8 ignore next -- fail-open manifest-read rejection is exercised in runAiReviewForAdvisory; this wrapper preserves the same fallback. */ + await loadRepoFocusManifest(env, repoFullName).catch(() => null), + ); + inlineCommentsEnabledForReview = shouldRequestInlineFindings( + env, + repoFullName, + reviewInlineComments, + ); + // Per-repo review CONTEXT (#review-skills): fold the container-private review/AGENTS.md (or legacy + // review/CLAUDE.md) guide + the matching review/skills/*.md modules into the SAME review-instructions slot, + // so reviews follow each repo's conventions. + // Glob-gated for cost (only skills matching the changed files are injected); absent config dir ⇒ empty ⇒ + // byte-identical prompt. getReviewFiles() is memoized, so the second call reuses the loaded diff. + const reviewInstructions = + [ + manifestReviewInstructions, + composeRepoReviewContext( + await loadRepoReviewContext(repoFullName), + (await getReviewFiles()).map((file) => file.path), + ), + ] + .map((part) => part?.trim()) + .filter(Boolean) + .join("\n\n") || null; + aiReview = await runAiReviewForAdvisory(env, { + settings, + advisory, + installationId, + repoFullName, + pr: { ...pr, baseSha: webhook.baseSha ?? null }, + author, + confirmedContributor, + files: await getReviewFiles(), + reviewProfile, + reviewPathInstructions, + reviewInstructions, + reviewExcludePaths, + reviewInlineComments, + }); + if (aiReview && aiReview.cacheable !== false) + await putCachedAiReview( + env, + repoFullName, + pr.number, + advisory.headSha, + settings.aiReviewMode, + aiReview, + ).catch(() => undefined); + } + }, + ); } if (aiReviewExpected && !hasPublicReviewAssessment(aiReview?.notes)) { const message = @@ -5183,6 +5268,7 @@ async function maybePublishPrPublicSurface( captureReviewFailure(new Error(message), { kind: "review", reason: "ai_review_public_summary_missing", + installationId, repo: repoFullName, pr: pr.number, head_sha: advisory.headSha, @@ -5254,24 +5340,40 @@ async function maybePublishPrPublicSurface( authorHistory, gateSizeContext, ); - gateEvaluation = gateEnabled - ? evaluateGateCheck(advisory, gatePolicy) - : undefined; - // Deterministic content/registry surface lane (#1255) — flag-gated + per-repo allowlist, byte-identical when - // off (evaluateWithSurfaceLane returns the generic evaluation unchanged and resolves no files). A metagraphed - // registry-submission PR's surface verdict OVERRIDES the generic gate; the helper preserves a generic HARD - // blocker (e.g. a committed secret) and an unreadable head defers. AI-free → independent of the AI reviewer. - gateEvaluation = await evaluateWithSurfaceLane( - env, - repoFullName, - gateEnabled, - gateEvaluation, + gateEvaluation = await withReviewPipelineSpan( + "selfhost.review.gate", { installationId, - pr, - repo, - advisory, - getChangedFiles: getReviewFiles, + repoFullName, + pullNumber: pr.number, + operation: "gate_decision", + }, + async () => { + let evaluation = gateEnabled + ? evaluateGateCheck(advisory, gatePolicy) + : undefined; + // Deterministic content/registry surface lane (#1255) — flag-gated + per-repo allowlist, byte-identical when + // off (evaluateWithSurfaceLane returns the generic evaluation unchanged and resolves no files). A metagraphed + // registry-submission PR's surface verdict OVERRIDES the generic gate; the helper preserves a generic HARD + // blocker (e.g. a committed secret) and an unreadable head defers. AI-free → independent of the AI reviewer. + evaluation = await evaluateWithSurfaceLane( + env, + repoFullName, + gateEnabled, + evaluation, + { + installationId, + pr, + repo, + advisory, + getChangedFiles: getReviewFiles, + }, + ); + if (evaluation) + await setReviewPipelineSpanOutcome({ + decisionOutcome: evaluation.conclusion, + }); + return evaluation; }, ); // #554 gate false-positive telemetry: when the gate BLOCKS, record the block (one latest row per PR) so a @@ -5320,20 +5422,31 @@ async function maybePublishPrPublicSurface( } if (gateEnabled) { try { - const gateCheckResult = await createOrUpdateGateCheckRun( - env, - installationId, - repoFullName, - advisory, - gatePolicy, + const gateCheckResult = await withReviewPipelineSpan( + "selfhost.review.publish.check_run", { - checkRunId: pendingGateCheckRunId, - // #5 (audit): publish the AUTHORITATIVE surface-lane-merged verdict so the check-run conclusion matches - // the disposition; without this the check re-derives the generic verdict and shows green on a surface- - // lane reject/manual PR that is actually auto-closed/held. Undefined (gate off) ⇒ re-derive (identical). - gate: gateEvaluation, + installationId, + repoFullName, + pullNumber: pr.number, + operation: "publish_check_run", + decisionOutcome: gateEvaluation?.conclusion, }, - mode, + () => + createOrUpdateGateCheckRun( + env, + installationId, + repoFullName, + advisory, + gatePolicy, + { + checkRunId: pendingGateCheckRunId, + // #5 (audit): publish the AUTHORITATIVE surface-lane-merged verdict so the check-run conclusion matches + // the disposition; without this the check re-derives the generic verdict and shows green on a surface- + // lane reject/manual PR that is actually auto-closed/held. Undefined (gate off) ⇒ re-derive (identical). + gate: gateEvaluation, + }, + mode, + ), ); if (gateCheckResult?.kind === "published") gateFinalized = true; if (gateCheckResult?.kind === "permission_missing") { @@ -5755,13 +5868,24 @@ async function maybePublishPrPublicSurface( deterministicBody = buildPublicPrIntelligenceComment(commentArgs); } try { - await createOrUpdatePrIntelligenceComment( - env, - installationId, - repoFullName, - pr.number, - deterministicBody, - { mode }, + await withReviewPipelineSpan( + "selfhost.review.publish.comment", + { + installationId, + repoFullName, + pullNumber: pr.number, + operation: "publish_comment", + decisionOutcome: gateEvaluation?.conclusion, + }, + () => + createOrUpdatePrIntelligenceComment( + env, + installationId, + repoFullName, + pr.number, + deterministicBody, + { mode }, + ), ); publishedOutputs.push("comment"); incr("gittensory_reviews_published_total", { repo: repoFullName }); @@ -5797,16 +5921,27 @@ async function maybePublishPrPublicSurface( } if (decision.willLabel) { try { - await ensurePullRequestLabel( - env, - installationId, - repoFullName, - pr.number, - settings.gittensorLabel, + await withReviewPipelineSpan( + "selfhost.review.publish.label", { - createMissingLabel: settings.createMissingLabel, - mode, + installationId, + repoFullName, + pullNumber: pr.number, + operation: "publish_label", + decisionOutcome: gateEvaluation?.conclusion, }, + () => + ensurePullRequestLabel( + env, + installationId, + repoFullName, + pr.number, + settings.gittensorLabel, + { + createMissingLabel: settings.createMissingLabel, + mode, + }, + ), ); publishedOutputs.push("label"); } catch (error) { @@ -5892,6 +6027,7 @@ async function maybePublishPrPublicSurface( // advisory-only bot this is the worst failure — escalate to Sentry at error level, not just the audit ledger. captureReviewFailure(new Error("PR public-surface publish failed — review produced output but nothing was posted to the PR"), { kind: "publish", + installationId, owner: repoFullName.split("/")[0], repo: repoFullName, pr: pr.number, diff --git a/src/selfhost/otel.ts b/src/selfhost/otel.ts index a75cf2609..61de0261f 100644 --- a/src/selfhost/otel.ts +++ b/src/selfhost/otel.ts @@ -1,5 +1,6 @@ import { AsyncLocalStorage } from "node:async_hooks"; -import type { Attributes, Context, Tracer } from "@opentelemetry/api"; +import type { Attributes, Context, ContextManager, TextMapPropagator, Tracer } from "@opentelemetry/api"; +import type { ReadableSpan, Sampler, Span, SpanProcessor } from "@opentelemetry/sdk-trace-base"; type OtelApi = typeof import("@opentelemetry/api"); type OtelSdk = typeof import("@opentelemetry/sdk-trace-node"); @@ -9,6 +10,13 @@ type OtelProvider = { shutdown(): Promise; }; type SpanOptions = { parentTraceParent?: string | undefined }; +export type OpenTelemetryBridge = { + sampler?: Sampler; + spanProcessor?: SpanProcessor; + propagator?: TextMapPropagator; + contextManager?: ContextManager; + validate?: () => void; +}; export type OtelTraceIds = { trace_id: string; span_id: string }; export type OtelTraceLogFields = { trace_id: string; span_id?: string }; @@ -42,6 +50,10 @@ export function resolveOtelTraceEndpoint(env: NodeJS.ProcessEnv): string | undef return trimmed.endsWith("/v1/traces") ? trimmed : `${trimmed}/v1/traces`; } +export function openTelemetryTraceExportEnabled(env: NodeJS.ProcessEnv): boolean { + return Boolean(resolveOtelTraceEndpoint(env) && traceExporterEnabled(env)); +} + function serviceAttributes(env: NodeJS.ProcessEnv): Attributes { const attrs: Attributes = { "service.name": nonBlank(env.OTEL_SERVICE_NAME) ?? "gittensory-selfhost", @@ -68,6 +80,48 @@ function samplerFromEnv(env: NodeJS.ProcessEnv, sdk: OtelSdk) { return new sdk.ParentBasedSampler({ root: new sdk.AlwaysOnSampler() }); } +function processorSamplingContext( + parentContext: Context, + spanDecisions: WeakMap, +): Context { + const api = Otel; + const parentSpan = api?.trace.getSpan(parentContext); + if (!api || !parentSpan) return parentContext; + const sampled = spanDecisions.get(parentSpan); + if (sampled === undefined) return parentContext; + return api.trace.setSpanContext(parentContext, { + ...parentSpan.spanContext(), + traceFlags: sampled ? api.TraceFlags.SAMPLED : api.TraceFlags.NONE, + }); +} + +function sampledSpanProcessor(spanProcessor: SpanProcessor, sampler: Sampler, sampledDecision: number): SpanProcessor { + const spanDecisions = new WeakMap(); + return { + forceFlush: () => spanProcessor.forceFlush(), + shutdown: () => spanProcessor.shutdown(), + onStart(span: Span, parentContext: Context) { + const samplingContext = processorSamplingContext(parentContext, spanDecisions); + const decision = sampler.shouldSample( + samplingContext, + span.spanContext().traceId, + span.name, + span.kind, + span.attributes, + span.links, + ).decision; + const sampled = decision === sampledDecision; + spanDecisions.set(span, sampled); + if (!sampled) return; + spanProcessor.onStart(span, parentContext); + }, + onEnd(span: ReadableSpan) { + if (!spanDecisions.get(span)) return; + spanProcessor.onEnd(span); + }, + }; +} + export function otelSafeAttributes(input: Record | undefined): Attributes { const out: Attributes = {}; if (!input) return out; @@ -187,25 +241,58 @@ export function selfHostHttpResponseAttributes(status: number): Record { +/** Initialize self-host OTEL traces. Sentry can attach its span processor/context bridge when configured. */ +export async function initOpenTelemetry( + env: NodeJS.ProcessEnv, + bridge?: OpenTelemetryBridge | undefined, +): Promise { const endpoint = resolveOtelTraceEndpoint(env); - if (!endpoint || !traceExporterEnabled(env)) return false; + const otlpEnabled = Boolean(endpoint && traceExporterEnabled(env)); + if (!otlpEnabled && !bridge?.spanProcessor) return false; if (active) return true; const [api, sdk, exporterNs, resources] = await Promise.all([ import("@opentelemetry/api"), import("@opentelemetry/sdk-trace-node"), - import("@opentelemetry/exporter-trace-otlp-http"), + otlpEnabled ? import("@opentelemetry/exporter-trace-otlp-http") : Promise.resolve(undefined), import("@opentelemetry/resources"), ]); - const exporter = new exporterNs.OTLPTraceExporter({ url: endpoint }); - provider = new sdk.NodeTracerProvider({ + const otelSampler = samplerFromEnv(env, sdk); + const independentBridgeSampler = Boolean(otlpEnabled && bridge?.spanProcessor && bridge.sampler); + const spanProcessors: SpanProcessor[] = []; + if (otlpEnabled && endpoint && exporterNs) { + const exporter = new exporterNs.OTLPTraceExporter({ url: endpoint }); + const otlpSpanProcessor = new sdk.BatchSpanProcessor(exporter); + spanProcessors.push( + independentBridgeSampler + ? sampledSpanProcessor(otlpSpanProcessor, otelSampler, sdk.SamplingDecision.RECORD_AND_SAMPLED) + : otlpSpanProcessor, + ); + } + if (bridge?.spanProcessor) { + const bridgeSpanProcessor = independentBridgeSampler && bridge.sampler + ? sampledSpanProcessor(bridge.spanProcessor, bridge.sampler, sdk.SamplingDecision.RECORD_AND_SAMPLED) + : bridge.spanProcessor; + spanProcessors.push(bridgeSpanProcessor); + } + const sampler = independentBridgeSampler + ? new sdk.AlwaysOnSampler() + : otlpEnabled + ? otelSampler + : bridge?.sampler ?? otelSampler; + const nextProvider = new sdk.NodeTracerProvider({ resource: resources.resourceFromAttributes(serviceAttributes(env)), - sampler: samplerFromEnv(env, sdk), - spanProcessors: [new sdk.BatchSpanProcessor(exporter)], + sampler, + spanProcessors, }); + if (bridge?.propagator || bridge?.contextManager) + nextProvider.register({ + ...(bridge.propagator ? { propagator: bridge.propagator } : {}), + ...(bridge.contextManager ? { contextManager: bridge.contextManager } : {}), + }); + bridge?.validate?.(); + provider = nextProvider; Otel = api; - tracer = provider.getTracer("gittensory-selfhost"); + tracer = nextProvider.getTracer("gittensory-selfhost"); active = true; return true; } diff --git a/src/selfhost/pg-queue.ts b/src/selfhost/pg-queue.ts index b4a57f205..180c6aa6a 100644 --- a/src/selfhost/pg-queue.ts +++ b/src/selfhost/pg-queue.ts @@ -343,7 +343,7 @@ export function createPgQueue( const rateLimitAdmission = await rateLimitAdmissionDelayMs(message); if (rateLimitAdmission !== null) { const rateLimitMetric = githubRateLimitMetricContext(message, rateLimitAdmission); - await withOtelSpan( + await withReviewSpan( "selfhost.queue.admission_deferred", { "job.type": message.type, diff --git a/src/selfhost/review-tracing.ts b/src/selfhost/review-tracing.ts new file mode 100644 index 000000000..cd6c74c45 --- /dev/null +++ b/src/selfhost/review-tracing.ts @@ -0,0 +1,64 @@ +import { sha256Hex } from "../utils/crypto"; +import { setCurrentOtelSpanAttributes, withOtelSpan } from "./otel"; + +const INSTALLATION_HASH_SEED = "github-installation:"; + +type ReviewTraceInput = { + installationId?: number | string | null | undefined; + repoFullName?: string | null | undefined; + pullNumber?: number | null | undefined; + operation?: string | undefined; + agent?: string | undefined; + decisionOutcome?: string | undefined; +}; + +function normalizeInstallationId(value: unknown): string | undefined { + if (typeof value === "number" && Number.isFinite(value)) return String(Math.trunc(value)); + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return /^[0-9]+$/.test(trimmed) ? trimmed : undefined; +} + +export function hashedInstallationIdWith( + value: unknown, + digestHex: (input: string) => string, +): string | undefined { + const normalized = normalizeInstallationId(value); + if (!normalized) return undefined; + return digestHex(`${INSTALLATION_HASH_SEED}${normalized}`).slice(0, 16); +} + +export async function hashedInstallationId(value: unknown): Promise { + const normalized = normalizeInstallationId(value); + if (!normalized) return undefined; + return (await sha256Hex(`${INSTALLATION_HASH_SEED}${normalized}`)).slice(0, 16); +} + +export async function reviewTraceAttributes( + input: ReviewTraceInput, +): Promise> { + const attrs: Record = {}; + if (input.repoFullName) attrs["github.repository"] = input.repoFullName; + if (input.pullNumber !== null && input.pullNumber !== undefined) + attrs["github.pull_request.number"] = input.pullNumber; + const installationHash = await hashedInstallationId(input.installationId); + if (installationHash) attrs["github.installation_id_hash"] = installationHash; + if (input.operation) attrs["gittensory.operation"] = input.operation; + if (input.agent) attrs["gittensory.agent"] = input.agent; + if (input.decisionOutcome) attrs["gittensory.decision_outcome"] = input.decisionOutcome; + return attrs; +} + +export async function withReviewPipelineSpan( + name: string, + input: ReviewTraceInput, + fn: () => T | Promise, +): Promise { + return withOtelSpan(name, await reviewTraceAttributes(input), fn); +} + +export async function setReviewPipelineSpanOutcome( + input: Pick, +): Promise { + setCurrentOtelSpanAttributes(await reviewTraceAttributes(input)); +} diff --git a/src/selfhost/sentry.ts b/src/selfhost/sentry.ts index fb7573f86..8d38c28d0 100644 --- a/src/selfhost/sentry.ts +++ b/src/selfhost/sentry.ts @@ -7,21 +7,28 @@ import { PUBLIC_UNSAFE_TERMS, } from "../signals/redaction"; import { hostname } from "node:os"; -import { currentOtelTraceIds } from "./otel"; +import { + currentOtelTraceIds, + openTelemetryTraceExportEnabled, + type OpenTelemetryBridge, +} from "./otel"; +import { hashedInstallationIdWith } from "./review-tracing"; type SentryNs = typeof import("@sentry/node"); +type SentryClient = NonNullable>; type SentryMonitorConfig = NonNullable[1]>; export type SentryMonitorName = "scheduled-loop" | "orb-export" | "orb-relay-drain"; type SentryScope = { setContext(name: string, context: Record): void; setTag(key: string, value: string): void; }; +type DigestHex = (input: string) => string; let Sentry: SentryNs | undefined; +let sentryClient: SentryClient | undefined; +let sentryTraceSampleRate: number | undefined; let active = false; let sentryEnvironment = "production"; -// The resolved tracing sample rate. Tracing stays a complete no-op (no spans started, no trace traffic) until this -// is configured above 0 — distinct from error capture, which is on whenever the DSN is set. (#1734) -let tracesSampleRate = 0; +let digestHexSync: DigestHex | undefined; const SECRET_KEY = /(token|secret|key|password|passwd|authorization|auth|dsn|cookie|bearer|credential|private)/i; @@ -61,6 +68,12 @@ function nonBlank(value: string | undefined): string | undefined { return trimmed ? trimmed : undefined; } +async function loadNodeHasher(): Promise { + const { createHash } = await import("node:crypto"); + digestHexSync = (input: string): string => + createHash("sha256").update(input).digest("hex"); +} + const SENTRY_MONITORS: Record = { "scheduled-loop": { slug: "scheduled-loop", @@ -142,12 +155,14 @@ export function resolveSentryRelease( return nonBlank(env.SENTRY_RELEASE) ?? nonBlank(env.GITTENSORY_VERSION); } -/** Resolve the trace sample rate, clamped to [0, 1]. Defaults to 0 (tracing off) — a malformed value is treated as - * off rather than full sampling, so a typo can never accidentally flood the tracer. (#1734) */ -export function resolveTracesSampleRate(env: NodeJS.ProcessEnv): number { - const parsed = Number(env.SENTRY_TRACES_SAMPLE_RATE ?? "0"); - if (!Number.isFinite(parsed)) return 0; - return Math.min(1, Math.max(0, parsed)); +export function resolveSentryTracesSampleRate( + env: NodeJS.ProcessEnv, +): number | undefined { + const raw = nonBlank(env.SENTRY_TRACES_SAMPLE_RATE); + if (!raw) return undefined; + const parsed = Number(raw); + if (!Number.isFinite(parsed) || parsed <= 0) return undefined; + return Math.min(parsed, 1); } /** beforeSend scrubber — redact anything token/secret-like before an event leaves the box (privacy boundary). */ @@ -194,6 +209,34 @@ function shouldRedactKey(key: string): boolean { ); } +function isInstallationIdKey(key: string): boolean { + return key.replace(/[^A-Za-z0-9]/g, "").toLowerCase() === "installationid"; +} + +function installationIdHash(value: unknown): string | undefined { + if (!digestHexSync) return undefined; + return hashedInstallationIdWith(value, digestHexSync); +} + +function hashedInstallationContext( + context: Record, +): Record { + const hasInstallationId = + "installation_id" in context || "installationId" in context; + const hash = installationIdHash(context.installation_id ?? context.installationId); + if (!hash && !hasInstallationId) return context; + const safe: Record = { ...context }; + if (hash) safe.installation_id_hash = hash; + delete safe.installation_id; + delete safe.installationId; + return safe; +} + +function tagHashedInstallation(scope: SentryScope, context: Record): void { + const hash = installationIdHash(context.installation_id ?? context.installationId); + if (hash) scope.setTag("installation_id_hash", hash); +} + function scrubString(value: string): string { return value .replace(QUERY_SECRET_VALUE, `$1${REDACTED}`) @@ -219,6 +262,12 @@ function scrubRecord(obj: unknown, depth: number): void { } const rec = obj as Record; for (const key of Object.keys(rec)) { + if (isInstallationIdKey(key)) { + const hash = installationIdHash(rec[key]); + if (hash) rec.installation_id_hash = hash; + delete rec[key]; + continue; + } if (shouldRedactKey(key)) { rec[key] = REDACTED; continue; @@ -304,18 +353,23 @@ function scrubAllowedContexts(contexts: Record | undefined): vo /** Initialize Sentry from the environment. Returns false (and stays a no-op) when SENTRY_DSN is unset. */ export async function initSentry(env: NodeJS.ProcessEnv): Promise { if (!env.SENTRY_DSN) return false; + await loadNodeHasher(); Sentry = await import("@sentry/node"); const release = resolveSentryRelease(env); + sentryTraceSampleRate = resolveSentryTracesSampleRate(env); + const useCustomOpenTelemetry = + sentryTraceSampleRate !== undefined || openTelemetryTraceExportEnabled(env); sentryEnvironment = nonBlank(env.SENTRY_ENVIRONMENT) ?? "production"; - tracesSampleRate = resolveTracesSampleRate(env); - Sentry.init({ + sentryClient = Sentry.init({ dsn: env.SENTRY_DSN, environment: sentryEnvironment, ...(release ? { release } : {}), - tracesSampleRate, - // Identify this instance by a CLEAN, configurable name — not the public-origin URL. An operator sets - // SENTRY_SERVER_NAME (e.g. "gittensory-us-east"); unset falls back to the OS hostname, which is dynamic - // per instance with no hardcoded value and reads as a name rather than a URL. + ...(sentryTraceSampleRate !== undefined + ? { tracesSampleRate: sentryTraceSampleRate } + : {}), + ...(useCustomOpenTelemetry ? { skipOpenTelemetrySetup: true } : {}), + // Identify this instance by a CLEAN, configurable name, not the public-origin URL. An operator sets + // SENTRY_SERVER_NAME (e.g. "gittensory-us-east"); unset falls back to the OS hostname. serverName: nonBlank(env.SENTRY_SERVER_NAME) ?? hostname(), beforeSend: (e) => scrubEvent(e), beforeSendTransaction: (e) => scrubEvent(e), @@ -324,6 +378,21 @@ export async function initSentry(env: NodeJS.ProcessEnv): Promise { return true; } +export async function buildSentryOpenTelemetryBridge(): Promise { + if (!active || !Sentry || !sentryClient) return undefined; + const SentryOtel = await import("@sentry/opentelemetry"); + const exportSentrySpans = sentryTraceSampleRate !== undefined; + return { + ...(exportSentrySpans ? { sampler: new SentryOtel.SentrySampler(sentryClient) } : {}), + propagator: new SentryOtel.SentryPropagator(), + contextManager: new Sentry.SentryContextManager(), + ...(exportSentrySpans ? { spanProcessor: new SentryOtel.SentrySpanProcessor() } : {}), + validate: () => { + Sentry?.validateOpenTelemetrySetup?.(); + }, + }; +} + /** Capture an error with optional structured context. No-op when Sentry is off. */ export function captureError( error: unknown, @@ -332,7 +401,8 @@ export function captureError( if (!active || !Sentry) return; Sentry.withScope((scope) => { setOtelTraceScope(scope); - if (context) scope.setContext("gittensory", context); + if (context) scope.setContext("gittensory", hashedInstallationContext(context)); + if (context) tagHashedInstallation(scope, context); Sentry!.captureException( error instanceof Error ? error : new Error(String(error)), ); @@ -350,9 +420,11 @@ export function captureReviewFailure( scope.setLevel("error"); setOtelTraceScope(scope); if (context) { - scope.setContext("review", context); - for (const tag of ["owner", "repo", "pr", "head_sha"]) { - const value = context[tag]; + const safeContext = hashedInstallationContext(context); + scope.setContext("review", safeContext); + tagHashedInstallation(scope, context); + for (const tag of ["owner", "repo", "pr", "head_sha", "operation", "agent", "decision_outcome"]) { + const value = safeContext[tag]; if (value !== undefined && value !== null) scope.setTag(tag, String(value)); } @@ -363,45 +435,9 @@ export function captureReviewFailure( }); } -/** True only when error capture is active AND trace sampling is configured above 0. When false, every span helper - * is a complete no-op — no span is started and no trace traffic is emitted (the #1734 "sampling off" guarantee). */ -export function sentryTracingEnabled(): boolean { - return active && Sentry !== undefined && tracesSampleRate > 0; -} - -/** Project an attribute bag onto the safe, low-cardinality subset allowed on a span: drop secret-keyed keys and - * null/undefined, keep finite numbers + booleans, and truncate strings — never a prompt/diff/token/body. */ -export function sentrySpanAttributes( - input: Record | undefined, -): Record { - const out: Record = {}; - if (!input) return out; - for (const [key, value] of Object.entries(input)) { - if (SECRET_KEY.test(key) || value === null || value === undefined) continue; - if (typeof value === "string") out[key] = value.length > 160 ? `${value.slice(0, 157)}...` : value; - else if (typeof value === "number" && Number.isFinite(value)) out[key] = value; - else if (typeof value === "boolean") out[key] = value; - } - return out; -} - -/** Run `fn` inside a Sentry span named `name`, tagged with the safe attributes. The span auto-closes and is marked - * errored if `fn` throws (so slow/failed stages are filterable). A pure pass-through to `fn` when tracing is off. */ -export async function withSentrySpan( - name: string, - attributes: Record | undefined, - fn: () => T | Promise, -): Promise { - if (!sentryTracingEnabled()) return fn(); - return Sentry!.startSpan( - { name, op: name, attributes: sentrySpanAttributes(attributes) }, - () => fn(), - ); -} - // The structured-log fields worth indexing as Sentry tags — the dimensions operators filter + group by. Only // string|number values are tagged; everything else stays in the full "log" context. -const SENTRY_LOG_TAG_KEYS = ["repo", "repository", "installationId", "installation_id", "pull", "pullNumber", "pr", "project", "kind", "deliveryId", "provider", "model", "effort", "timeoutMs", "trace_id", "span_id"] as const; +const SENTRY_LOG_TAG_KEYS = ["repo", "repository", "installation_id_hash", "pull", "pullNumber", "pr", "project", "kind", "deliveryId", "provider", "model", "effort", "timeoutMs", "trace_id", "span_id", "operation", "agent", "decision_outcome"] as const; /** A SHORT location suffix — " (repo#pr)" — for a no-message error title, so the issue list shows WHERE without * dumping every scalar field (which made titles unreadably long, e.g. trailing a full deliveryId). The complete @@ -436,8 +472,13 @@ const SUMMARY_SKIP_KEYS = new Set([ "error", "repo", "repository", + "installationId", + "installation_id", + "installation_id_hash", "pullNumber", "deliveryId", + "trace_id", + "span_id", ]); function redactSummaryValue(value: unknown, depth = 0): unknown { if (!value || typeof value !== "object") return value; @@ -482,6 +523,7 @@ export function forwardStructuredLogToSentry(line: unknown, fromErrorSink = fals } catch { return; // not JSON — an ordinary log line } + const safeObj = hashedInstallationContext(obj); // A console.error sink is error-level by DEFAULT even when the JSON omits an explicit level (many engine error // logs do) — that's how those errors reach Sentry instead of printing to stderr and vanishing. An EXPLICIT level // always wins, so a deliberate level:"warn" emitted via console.error is still skipped. @@ -503,34 +545,29 @@ export function forwardStructuredLogToSentry(line: unknown, fromErrorSink = fals // like close_breaker_engaged shows "project=x, closePrecision=0.6, floor=0.8") → else a context pointer. const value = detail ?? - ([logLocation(obj).trim(), summarizeLogFields(obj)] + ([logLocation(safeObj).trim(), summarizeLogFields(safeObj)] .filter(Boolean) .join(" ") || "(no message — see the log context)"); const errorEvent = new Error(value); errorEvent.name = event ?? "GittensoryLog"; - // This exception is SYNTHETIC — minted here from a console line, never thrown at the code that failed. Its captured - // JS stack therefore points at this forwarder and the console sink that called it (installStructuredLogForwarding), - // not at the origin. Left attached, Sentry computes EVERY forwarded issue's culprit as forwardStructuredLogToSentry, - // burying the real signal. Reduce the stack to its header line (the parser yields zero frames from it) so no frame - // is misattributed; the event slug supplies the culprit below and the `log` context keeps the full payload. + // This exception is synthetic: it was minted from a console line, never thrown at the failing code. Strip the + // wrapper stack so Sentry does not attribute forwarded operational issues to this forwarding helper. errorEvent.stack = `${errorEvent.name}: ${value}`; Sentry.withScope((scope) => { scope.setLevel(severity); setOtelTraceScope(scope); - scope.setContext("log", obj); + scope.setContext("log", safeObj); if (event) scope.setTag("event", event); // Index the dimensions operators filter + group by, so issues are findable without digging into the context. for (const key of SENTRY_LOG_TAG_KEYS) { - const tagValue = obj[key]; + const tagValue = safeObj[key]; if (typeof tagValue === "string" || typeof tagValue === "number") scope.setTag(key, String(tagValue)); } // Group recurrences of ONE failure into a single issue (by event, not the variable detail in the value). if (event) scope.setFingerprint(["gittensory-log", event]); - // Give the issue a legible culprit (the location Sentry shows under the title). It derives from event.transaction - // when set, else from the now-stripped stack — so point it at the operational event slug (e.g. - // "orb_broker_unavailable") rather than the forwarder. setTransactionName on the scope does NOT populate - // event.transaction in this SDK version, so set it on the event via a scoped processor. + // Sentry uses event.transaction as the issue culprit fallback when the stack has no frames; point it at the + // operational event slug rather than the forwarding helper. if (event) scope.addEventProcessor((sentryEvent) => { sentryEvent.transaction = event; @@ -590,9 +627,11 @@ export async function flushSentry(timeoutMs = 2000): Promise { /** Test-only: reset module state between cases. */ export function resetSentryForTest(): void { Sentry = undefined; + sentryClient = undefined; + sentryTraceSampleRate = undefined; active = false; sentryEnvironment = "production"; - tracesSampleRate = 0; + digestHexSync = undefined; } interface StructuredLogConsole { diff --git a/src/selfhost/sqlite-queue.ts b/src/selfhost/sqlite-queue.ts index 81b058e72..fc4a92cb0 100644 --- a/src/selfhost/sqlite-queue.ts +++ b/src/selfhost/sqlite-queue.ts @@ -286,7 +286,7 @@ export function createSqliteQueue( const rateLimitAdmission = rateLimitAdmissionDelayMs(driver, message); if (rateLimitAdmission !== null) { const rateLimitMetric = githubRateLimitMetricContext(message, rateLimitAdmission); - await withOtelSpan( + await withReviewSpan( "selfhost.queue.admission_deferred", { "job.type": message.type, diff --git a/src/selfhost/tracing.ts b/src/selfhost/tracing.ts index 53ec2c64f..cd0498d79 100644 --- a/src/selfhost/tracing.ts +++ b/src/selfhost/tracing.ts @@ -1,10 +1,6 @@ -// Shared review-pipeline span wrapper (#1734). Opens ONE boundary that feeds BOTH tracers — an OpenTelemetry span -// and a Sentry span — so a stage is instrumented once and shows up in whichever backend is enabled. Kept in this -// neutral module (rather than inside otel.ts or sentry.ts) so a caller wires the normal review boundary without -// importing from, or coupling to, either tracer's module. Each side independently no-ops when its backend is off, -// so this reduces to `fn()` when neither is configured. +// Shared review-pipeline span wrapper (#1734). Opens ONE OpenTelemetry boundary; Sentry receives the same span via +// the SentrySpanProcessor bridge when configured, avoiding duplicate direct Sentry spans. import { withOtelSpan } from "./otel"; -import { withSentrySpan } from "./sentry"; export async function withReviewSpan( name: string, @@ -12,5 +8,5 @@ export async function withReviewSpan( fn: () => T | Promise, options?: { parentTraceParent?: string | undefined }, ): Promise { - return withOtelSpan(name, attributes, () => withSentrySpan(name, attributes, fn), options); + return withOtelSpan(name, attributes, fn, options); } diff --git a/src/server.ts b/src/server.ts index 251265529..8c78b4038 100644 --- a/src/server.ts +++ b/src/server.ts @@ -53,6 +53,7 @@ import { makeLocalReviewContextReader, } from "./selfhost/private-config"; import { + buildSentryOpenTelemetryBridge, captureError, flushSentry, initSentry, @@ -66,6 +67,7 @@ import { import { currentOtelTraceParent, initOpenTelemetry, + openTelemetryTraceExportEnabled, selfHostHttpRequestAttributes, selfHostHttpResponseAttributes, setCurrentOtelSpanAttributes, @@ -251,10 +253,6 @@ function buildSqliteBackend( async function main(): Promise { loadFileSecrets(); - /* v8 ignore start -- importing this entrypoint starts the Node server; init behavior is covered in selfhost/otel tests. */ - if (await initOpenTelemetry(process.env)) - console.log(JSON.stringify({ event: "selfhost_otel", traces: "otlp" })); - /* v8 ignore stop */ // Container-private per-repo config (self-host): register the GITTENSORY_REPO_CONFIG_DIR reader so the focus- // manifest loader prefers a mounted `{owner}__{repo}.yml` over the public `.gittensory.yml` (review policy stays // private). Unset dir ⇒ null reader ⇒ unchanged public-fetch behavior. @@ -270,7 +268,9 @@ async function main(): Promise { // Error tracking (#1468): opt-in via SENTRY_DSN — a complete no-op when unset. When on, capture uncaught crashes // + unhandled rejections (flush before exit for the fatal case); per-subsystem captures (queue dead-letter, // review failures) are wired at their sites. - if (await initSentry(process.env)) { + /* v8 ignore start -- importing this entrypoint starts the Node server; Sentry/OTEL init behavior is covered in selfhost tests. */ + const sentryEnabled = await initSentry(process.env); + if (sentryEnabled) { console.log( JSON.stringify({ event: "selfhost_sentry", @@ -290,6 +290,9 @@ async function main(): Promise { // stderr. Wrap both sinks so every level:"error"/"fatal" line surfaces as a Sentry issue WITHOUT per-site wiring. installStructuredLogForwarding(); } + if (await initOpenTelemetry(process.env, sentryEnabled ? await buildSentryOpenTelemetryBridge() : undefined)) + console.log(JSON.stringify({ event: "selfhost_otel", traces: openTelemetryTraceExportEnabled(process.env) ? "otlp" : "sentry" })); + /* v8 ignore stop */ const startedAt = Date.now(); // The queue consumer captures `env`, assigned below (the first job only runs once an HTTP/cron event diff --git a/test/unit/queue.test.ts b/test/unit/queue.test.ts index 7b5e754fc..1e71f12f2 100644 --- a/test/unit/queue.test.ts +++ b/test/unit/queue.test.ts @@ -4362,6 +4362,12 @@ describe("queue processors", () => { slopAiAdvisory: true, }); let gatePatchBody: { conclusion?: string; output?: { title?: string; text?: string } } = {}; + const cacheReadSpy = vi + .spyOn(repositoriesModule, "getCachedAiReview") + .mockRejectedValueOnce(new Error("cache read failed")); + const cacheWriteSpy = vi + .spyOn(repositoriesModule, "putCachedAiReview") + .mockRejectedValueOnce(new Error("cache write failed")); vi.stubGlobal("fetch", async (input: RequestInfo | URL, init?: RequestInit) => { const url = input.toString(); const method = init?.method ?? "GET"; @@ -4402,6 +4408,10 @@ describe("queue processors", () => { // The AI usage event was recorded for the review (never with key material). const usage = await env.DB.prepare("select feature, status from ai_usage_events where feature = ?").bind("ai_review_pr").first<{ feature: string; status: string }>(); expect(usage).toMatchObject({ feature: "ai_review_pr", status: "ok" }); + expect(cacheReadSpy).toHaveBeenCalled(); + expect(cacheWriteSpy).toHaveBeenCalled(); + cacheReadSpy.mockRestore(); + cacheWriteSpy.mockRestore(); }); it("finalizes the Gate to neutral instead of leaving it in_progress when gate completion fails", async () => { diff --git a/test/unit/selfhost-otel.test.ts b/test/unit/selfhost-otel.test.ts index 4abf04028..a187d044c 100644 --- a/test/unit/selfhost-otel.test.ts +++ b/test/unit/selfhost-otel.test.ts @@ -1,4 +1,5 @@ import { beforeEach, describe, expect, it, vi } from "vitest"; +import { ROOT_CONTEXT } from "@opentelemetry/api"; const otelMocks = vi.hoisted(() => { const exportedSpans: any[] = []; @@ -25,6 +26,7 @@ import { currentOtelTraceParent, flushOpenTelemetry, initOpenTelemetry, + openTelemetryTraceExportEnabled, otelSafeAttributes, otelTraceLogFields, resetOpenTelemetryForTest, @@ -34,6 +36,13 @@ import { setCurrentOtelSpanAttributes, withOtelSpan, } from "../../src/selfhost/otel"; +import { + hashedInstallationId, + hashedInstallationIdWith, + reviewTraceAttributes, + setReviewPipelineSpanOutcome, + withReviewPipelineSpan, +} from "../../src/selfhost/review-tracing"; import { clearSelfHostRequestTraceParent, getSelfHostRequestTraceParent, @@ -61,6 +70,8 @@ describe("self-host OpenTelemetry", () => { expect(resolveOtelTraceEndpoint(env({ OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: "http://collector/custom" }))).toBe( "http://collector/custom", ); + expect(openTelemetryTraceExportEnabled(env({ OTEL_EXPORTER_OTLP_ENDPOINT: "http://otel-collector:4318" }))).toBe(false); + expect(openTelemetryTraceExportEnabled(env({ OTEL_TRACES_EXPORTER: "otlp", OTEL_EXPORTER_OTLP_ENDPOINT: "http://otel-collector:4318" }))).toBe(true); expect(await initOpenTelemetry(env({ OTEL_EXPORTER_OTLP_ENDPOINT: "http://otel-collector:4318" }))).toBe(false); await flushOpenTelemetry(); await expect(withOtelSpan("off", { "job.type": "x" }, () => 42)).resolves.toBe(42); @@ -333,6 +344,270 @@ describe("self-host OpenTelemetry", () => { expect(otelMocks.exportedSpans.map((span) => span.name)).toContain("ratio-defaults-on"); }); + it("can export custom spans through a Sentry bridge without requiring OTLP export", async () => { + const sentryEndedSpans: any[] = []; + const sentryProcessor = { + onStart: vi.fn(), + onEnd: vi.fn((span: unknown) => sentryEndedSpans.push(span)), + forceFlush: vi.fn(async () => undefined), + shutdown: vi.fn(async () => undefined), + }; + const validate = vi.fn(); + const propagator = { + inject: vi.fn(), + extract: vi.fn((context) => context), + fields: vi.fn(() => []), + }; + let contextManager: any; + contextManager = { + active: vi.fn(() => ROOT_CONTEXT), + with: vi.fn((context, fn, thisArg, ...args) => fn.apply(thisArg, args)), + bind: vi.fn((_context, target) => target), + enable: vi.fn(() => contextManager), + disable: vi.fn(() => contextManager), + }; + + expect(await initOpenTelemetry(env({}), { + spanProcessor: sentryProcessor, + propagator, + contextManager, + validate, + })).toBe(true); + await withOtelSpan("selfhost.review.gate", { "gittensory.operation": "gate_decision" }, () => undefined); + await flushOpenTelemetry(); + + expect(otelMocks.OTLPTraceExporter).not.toHaveBeenCalled(); + expect(validate).toHaveBeenCalledTimes(1); + expect(contextManager.enable).toHaveBeenCalledTimes(1); + expect(sentryProcessor.onEnd).toHaveBeenCalledTimes(1); + expect(sentryEndedSpans[0].name).toBe("selfhost.review.gate"); + expect(sentryEndedSpans[0].attributes).toMatchObject({ + "gittensory.operation": "gate_decision", + }); + + await resetOpenTelemetryForTest(); + + expect(await initOpenTelemetry(env({}), { spanProcessor: sentryProcessor, propagator })).toBe(true); + await resetOpenTelemetryForTest(); + + expect(await initOpenTelemetry(env({}), { spanProcessor: sentryProcessor, contextManager })).toBe(true); + expect(contextManager.enable).toHaveBeenCalledTimes(2); + + await resetOpenTelemetryForTest(); + otelMocks.exportedSpans.length = 0; + + expect(await initOpenTelemetry(env({ + OTEL_TRACES_EXPORTER: "otlp", + OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector", + OTEL_TRACES_SAMPLER: "always_on", + }), { propagator, contextManager })).toBe(true); + await withOtelSpan("otlp-only-with-sentry-context", undefined, () => undefined); + await flushOpenTelemetry(); + expect(otelMocks.exportedSpans.map((span) => span.name)).toContain("otlp-only-with-sentry-context"); + expect(contextManager.enable).toHaveBeenCalledTimes(3); + + await resetOpenTelemetryForTest(); + otelMocks.exportedSpans.length = 0; + sentryEndedSpans.length = 0; + sentryProcessor.onStart.mockClear(); + sentryProcessor.onEnd.mockClear(); + sentryProcessor.forceFlush.mockClear(); + sentryProcessor.shutdown.mockClear(); + + expect(await initOpenTelemetry(env({ + OTEL_TRACES_EXPORTER: "otlp", + OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector", + OTEL_TRACES_SAMPLER: "always_on", + }), { + spanProcessor: sentryProcessor, + propagator, + contextManager, + })).toBe(true); + await withOtelSpan("otlp-and-sentry-no-sampler", undefined, () => undefined); + await flushOpenTelemetry(); + expect(otelMocks.exportedSpans.map((span) => span.name)).toContain("otlp-and-sentry-no-sampler"); + expect(sentryEndedSpans.map((span) => span.name)).toContain("otlp-and-sentry-no-sampler"); + expect(sentryProcessor.forceFlush).toHaveBeenCalledTimes(1); + expect(contextManager.enable).toHaveBeenCalledTimes(4); + + await resetOpenTelemetryForTest(); + otelMocks.exportedSpans.length = 0; + sentryEndedSpans.length = 0; + sentryProcessor.onStart.mockClear(); + sentryProcessor.onEnd.mockClear(); + sentryProcessor.forceFlush.mockClear(); + sentryProcessor.shutdown.mockClear(); + const dropAllBridgeSampler = { + shouldSample: vi.fn((..._args: any[]) => ({ decision: 0 })), + toString: () => "drop-all-bridge-sampler", + }; + + expect(await initOpenTelemetry(env({ + OTEL_TRACES_EXPORTER: "otlp", + OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector", + OTEL_TRACES_SAMPLER: "always_on", + }), { + sampler: dropAllBridgeSampler as any, + spanProcessor: sentryProcessor, + propagator, + contextManager, + })).toBe(true); + await withOtelSpan("otlp-keeps-env-sampler", undefined, () => undefined); + await flushOpenTelemetry(); + expect(otelMocks.exportedSpans.map((span) => span.name)).toContain("otlp-keeps-env-sampler"); + expect(dropAllBridgeSampler.shouldSample).toHaveBeenCalledTimes(1); + expect(dropAllBridgeSampler.shouldSample.mock.calls[0]?.[2]).toBe("otlp-keeps-env-sampler"); + expect(sentryProcessor.onStart).not.toHaveBeenCalled(); + expect(sentryProcessor.onEnd).not.toHaveBeenCalled(); + expect(sentryProcessor.forceFlush).toHaveBeenCalledTimes(1); + expect(contextManager.enable).toHaveBeenCalledTimes(5); + + await resetOpenTelemetryForTest(); + otelMocks.exportedSpans.length = 0; + sentryEndedSpans.length = 0; + sentryProcessor.onStart.mockClear(); + sentryProcessor.onEnd.mockClear(); + sentryProcessor.forceFlush.mockClear(); + sentryProcessor.shutdown.mockClear(); + const sampleBridgeSampler = { + shouldSample: vi.fn((..._args: any[]) => ({ decision: 2 })), + toString: () => "sample-bridge-sampler", + }; + + expect(await initOpenTelemetry(env({ + OTEL_TRACES_EXPORTER: "otlp", + OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector", + OTEL_TRACES_SAMPLER: "always_on", + }), { + sampler: sampleBridgeSampler as any, + spanProcessor: sentryProcessor, + propagator, + contextManager, + })).toBe(true); + await withOtelSpan("otlp-and-sentry-sampled", undefined, () => undefined); + await flushOpenTelemetry(); + expect(otelMocks.exportedSpans.map((span) => span.name)).toContain("otlp-and-sentry-sampled"); + expect(sampleBridgeSampler.shouldSample).toHaveBeenCalledTimes(1); + expect(sentryProcessor.onStart).toHaveBeenCalledTimes(1); + expect(sentryEndedSpans.map((span) => span.name)).toContain("otlp-and-sentry-sampled"); + expect(sentryProcessor.forceFlush).toHaveBeenCalledTimes(1); + expect(contextManager.enable).toHaveBeenCalledTimes(6); + + await resetOpenTelemetryForTest(); + otelMocks.exportedSpans.length = 0; + sentryEndedSpans.length = 0; + sentryProcessor.onStart.mockClear(); + sentryProcessor.onEnd.mockClear(); + sentryProcessor.forceFlush.mockClear(); + sentryProcessor.shutdown.mockClear(); + const sampleSentryWhenOtelDropsSampler = { + shouldSample: vi.fn((..._args: any[]) => ({ decision: 2 })), + toString: () => "sample-sentry-when-otel-drops", + }; + + expect(await initOpenTelemetry(env({ + OTEL_TRACES_EXPORTER: "otlp", + OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector", + OTEL_TRACES_SAMPLER: "always_off", + }), { + sampler: sampleSentryWhenOtelDropsSampler as any, + spanProcessor: sentryProcessor, + propagator, + contextManager, + })).toBe(true); + await withOtelSpan("sentry-keeps-span-when-otel-drops", undefined, () => undefined); + await flushOpenTelemetry(); + expect(otelMocks.exportedSpans.map((span) => span.name)).not.toContain("sentry-keeps-span-when-otel-drops"); + expect(sampleSentryWhenOtelDropsSampler.shouldSample).toHaveBeenCalledTimes(1); + expect(sentryProcessor.onStart).toHaveBeenCalledTimes(1); + expect(sentryEndedSpans.map((span) => span.name)).toContain("sentry-keeps-span-when-otel-drops"); + expect(sentryProcessor.forceFlush).toHaveBeenCalledTimes(1); + expect(contextManager.enable).toHaveBeenCalledTimes(7); + }); + + it("keeps parent-based OTLP sampling isolated when Sentry samples nested spans", async () => { + const sentryEndedSpans: any[] = []; + const sentryProcessor = { + onStart: vi.fn(), + onEnd: vi.fn((span: unknown) => sentryEndedSpans.push(span)), + forceFlush: vi.fn(async () => undefined), + shutdown: vi.fn(async () => undefined), + }; + const sampleSentrySampler = { + shouldSample: vi.fn((..._args: any[]) => ({ decision: 2 })), + toString: () => "sample-sentry", + }; + + expect(await initOpenTelemetry(env({ + OTEL_TRACES_EXPORTER: "otlp", + OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector", + OTEL_TRACES_SAMPLER: "parentbased_always_off", + }), { + sampler: sampleSentrySampler as any, + spanProcessor: sentryProcessor, + })).toBe(true); + await withOtelSpan("sentry-only-parent", undefined, async () => { + await withOtelSpan("sentry-only-child", undefined, () => undefined); + }); + await withOtelSpan( + "remote-parent-not-sampled", + undefined, + () => undefined, + { parentTraceParent: "00-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-bbbbbbbbbbbbbbbb-00" }, + ); + await flushOpenTelemetry(); + + expect(otelMocks.exportedSpans.map((span) => span.name)).toEqual([]); + expect(sentryEndedSpans.map((span) => span.name)).toEqual([ + "sentry-only-child", + "sentry-only-parent", + "remote-parent-not-sampled", + ]); + expect(sampleSentrySampler.shouldSample).toHaveBeenCalledTimes(3); + expect(sentryProcessor.onStart).toHaveBeenCalledTimes(3); + expect(sentryProcessor.forceFlush).toHaveBeenCalledTimes(1); + }); + + it("adds hashed tenant and decision attributes to review pipeline spans", async () => { + await initOpenTelemetry(env({ + OTEL_TRACES_EXPORTER: "otlp", + OTEL_EXPORTER_OTLP_ENDPOINT: "http://collector", + })); + + expect(await hashedInstallationId(143010787)).toBe("68b9c2136087c5ca"); + expect(await hashedInstallationId(" 143010787 ")).toBe("68b9c2136087c5ca"); + expect(await hashedInstallationId("not-an-id")).toBeUndefined(); + expect(await hashedInstallationId(Number.NaN)).toBeUndefined(); + expect(hashedInstallationIdWith(143010787, () => "a".repeat(64))).toBe("aaaaaaaaaaaaaaaa"); + expect(hashedInstallationIdWith(" ", () => "a".repeat(64))).toBeUndefined(); + expect(hashedInstallationIdWith(null, () => "a".repeat(64))).toBeUndefined(); + await withReviewPipelineSpan( + "selfhost.review.gate", + { + installationId: 143010787, + repoFullName: "JSONbored/gittensory", + pullNumber: 1001, + operation: "gate_decision", + agent: "dual-ai", + }, + async () => { + await setReviewPipelineSpanOutcome({ decisionOutcome: "success" }); + }, + ); + await flushOpenTelemetry(); + + const span = otelMocks.exportedSpans.find((entry) => entry.name === "selfhost.review.gate"); + expect(span.attributes).toMatchObject({ + "github.repository": "JSONbored/gittensory", + "github.pull_request.number": 1001, + "github.installation_id_hash": "68b9c2136087c5ca", + "gittensory.operation": "gate_decision", + "gittensory.agent": "dual-ai", + "gittensory.decision_outcome": "success", + }); + await expect(reviewTraceAttributes({})).resolves.toEqual({}); + }); + it("swallows exporter flush and shutdown failures", async () => { await initOpenTelemetry(env({ OTEL_TRACES_EXPORTER: "otlp", diff --git a/test/unit/selfhost-sentry.test.ts b/test/unit/selfhost-sentry.test.ts index de196b9da..16e31ac0e 100644 --- a/test/unit/selfhost-sentry.test.ts +++ b/test/unit/selfhost-sentry.test.ts @@ -3,19 +3,40 @@ import { hostname } from "node:os"; // Mock @sentry/node so the dynamic import inside initSentry() resolves to spies. Hoisted so vi.mock can see it. const mocks = vi.hoisted(() => { - const scope = { setContext: vi.fn(), setLevel: vi.fn(), setTag: vi.fn(), setFingerprint: vi.fn(), addEventProcessor: vi.fn() }; + const scope = { + setContext: vi.fn(), + setLevel: vi.fn(), + setTag: vi.fn(), + setFingerprint: vi.fn(), + addEventProcessor: vi.fn(), + }; + const client = { id: "sentry-client" }; return { + client, scope, - init: vi.fn(), + init: vi.fn(() => client), withScope: vi.fn((cb: (s: typeof scope) => void) => cb(scope)), captureException: vi.fn(), captureMessage: vi.fn(), captureCheckIn: vi.fn((checkIn: { checkInId?: string }) => checkIn.checkInId ?? "check-in-id"), flush: vi.fn().mockResolvedValue(true), - // Mirror @sentry/node's startSpan contract: invoke the callback inside the span and return its value. - startSpan: vi.fn((_opts: unknown, cb: () => T): T => cb()), + SentryContextManager: vi.fn(function (this: { kind: string }) { + this.kind = "context-manager"; + }), + validateOpenTelemetrySetup: vi.fn(), }; }); +const sentryOtelMocks = vi.hoisted(() => ({ + SentrySampler: vi.fn(function (this: { client: unknown }, client: unknown) { + this.client = client; + }), + SentryPropagator: vi.fn(function (this: { kind: string }) { + this.kind = "propagator"; + }), + SentrySpanProcessor: vi.fn(function (this: { kind: string }) { + this.kind = "span-processor"; + }), +})); const otelMocks = vi.hoisted(() => ({ currentOtelTraceIds: vi.fn(), })); @@ -26,13 +47,22 @@ vi.mock("@sentry/node", () => ({ captureMessage: mocks.captureMessage, captureCheckIn: mocks.captureCheckIn, flush: mocks.flush, - startSpan: mocks.startSpan, + SentryContextManager: mocks.SentryContextManager, + validateOpenTelemetrySetup: mocks.validateOpenTelemetrySetup, +})); +vi.mock("@sentry/opentelemetry", () => ({ + SentrySampler: sentryOtelMocks.SentrySampler, + SentryPropagator: sentryOtelMocks.SentryPropagator, + SentrySpanProcessor: sentryOtelMocks.SentrySpanProcessor, })); vi.mock("../../src/selfhost/otel", () => ({ currentOtelTraceIds: otelMocks.currentOtelTraceIds, + openTelemetryTraceExportEnabled: (env: NodeJS.ProcessEnv) => + Boolean(env.OTEL_TRACES_EXPORTER?.includes("otlp") && env.OTEL_EXPORTER_OTLP_ENDPOINT), })); import { + buildSentryOpenTelemetryBridge, initSentry, captureError, captureReviewFailure, @@ -40,14 +70,11 @@ import { forwardStructuredLogToSentry, installStructuredLogForwarding, resolveSentryRelease, + resolveSentryTracesSampleRate, resolveSentryMonitorSlug, - resolveTracesSampleRate, scrubEvent, resetSentryForTest, - sentryTracingEnabled, - sentrySpanAttributes, withSentryMonitor, - withSentrySpan, } from "../../src/selfhost/sentry"; beforeEach(() => { @@ -65,6 +92,8 @@ const scrubbedEvent = (event: T): T => { expect(scrubbed).not.toBeNull(); return scrubbed as T; }; +const lastInitOptions = (): any => + (mocks.init.mock.calls.at(-1) as unknown[] | undefined)?.[0] as any; const fakeClassicAccessToken = (): string => `${"github" + "_pat_"}${"a".repeat(24)}`; const fakeQueryTokenKey = (): string => "github" + "_token"; @@ -269,6 +298,31 @@ describe("scrubEvent — redact secrets before an event leaves the box", () => { expect(scrubEvent(event)).toBeNull(); }); + + it("drops raw installation ids before hashing is available and uses a stable hash after init", async () => { + const noHasherEvent = scrubbedEvent({ + extra: { installationId: 143010787 }, + }) as any; + expect(noHasherEvent.extra.installationId).toBeUndefined(); + expect(noHasherEvent.extra.installation_id_hash).toBeUndefined(); + + const invalidEvent = scrubbedEvent({ + extra: { installation_id: "not-an-installation" }, + }) as any; + expect(invalidEvent.extra.installation_id).toBeUndefined(); + expect(invalidEvent.extra.installation_id_hash).toBeUndefined(); + + await initSentry({ SENTRY_DSN: "d" } as unknown as NodeJS.ProcessEnv); + const ev = scrubbedEvent({ + tags: { installationId: 143010787, repo: "owner/repo" }, + contexts: { log: { installation_id: "143010787" } }, + }) as any; + + expect(ev.tags.installationId).toBeUndefined(); + expect(ev.tags.installation_id_hash).toBe("68b9c2136087c5ca"); + expect(ev.contexts.log.installation_id).toBeUndefined(); + expect(ev.contexts.log.installation_id_hash).toBe("68b9c2136087c5ca"); + }); }); describe("disabled when SENTRY_DSN is unset (modular opt-out → complete no-op)", () => { @@ -308,6 +362,16 @@ describe("enabled when SENTRY_DSN is set", () => { expect(resolveSentryRelease({} as unknown as NodeJS.ProcessEnv)).toBeUndefined(); }); + it("resolves a positive Sentry trace sample rate and treats unset/zero/invalid as disabled", () => { + expect(resolveSentryTracesSampleRate({} as unknown as NodeJS.ProcessEnv)).toBeUndefined(); + expect(resolveSentryTracesSampleRate({ SENTRY_TRACES_SAMPLE_RATE: " " } as unknown as NodeJS.ProcessEnv)).toBeUndefined(); + expect(resolveSentryTracesSampleRate({ SENTRY_TRACES_SAMPLE_RATE: "0" } as unknown as NodeJS.ProcessEnv)).toBeUndefined(); + expect(resolveSentryTracesSampleRate({ SENTRY_TRACES_SAMPLE_RATE: "-1" } as unknown as NodeJS.ProcessEnv)).toBeUndefined(); + expect(resolveSentryTracesSampleRate({ SENTRY_TRACES_SAMPLE_RATE: "nope" } as unknown as NodeJS.ProcessEnv)).toBeUndefined(); + expect(resolveSentryTracesSampleRate({ SENTRY_TRACES_SAMPLE_RATE: "0.25" } as unknown as NodeJS.ProcessEnv)).toBe(0.25); + expect(resolveSentryTracesSampleRate({ SENTRY_TRACES_SAMPLE_RATE: "9" } as unknown as NodeJS.ProcessEnv)).toBe(1); + }); + it("returns true and wires init with defaults (?? right-hand branches) + the scrubber as beforeSend", async () => { expect( await initSentry({ @@ -315,10 +379,11 @@ describe("enabled when SENTRY_DSN is set", () => { } as unknown as NodeJS.ProcessEnv), ).toBe(true); expect(mocks.init).toHaveBeenCalledTimes(1); - const opts = mocks.init.mock.calls[0]![0]; + const opts = lastInitOptions(); expect(opts.environment).toBe("production"); expect(opts.release).toBeUndefined(); - expect(opts.tracesSampleRate).toBe(0); + expect(opts.tracesSampleRate).toBeUndefined(); + expect(opts.skipOpenTelemetrySetup).toBeUndefined(); expect( opts.beforeSend({ extra: { sessionToken: "s" } }).extra.sessionToken, ).toBe("[redacted]"); @@ -337,16 +402,51 @@ describe("enabled when SENTRY_DSN is set", () => { SENTRY_TRACES_SAMPLE_RATE: "0.5", SENTRY_SERVER_NAME: "gittensory-us-east", } as unknown as NodeJS.ProcessEnv); - const opts = mocks.init.mock.calls[0]![0]; + const opts = lastInitOptions(); expect(opts.environment).toBe("staging"); expect(opts.release).toBe("v9"); expect(opts.tracesSampleRate).toBe(0.5); + expect(opts.skipOpenTelemetrySetup).toBe(true); expect(opts.serverName).toBe("gittensory-us-east"); }); it("defaults serverName to the OS hostname (not the API-origin URL) when SENTRY_SERVER_NAME is unset/blank", async () => { await initSentry({ SENTRY_DSN: "d", SENTRY_SERVER_NAME: " ", PUBLIC_API_ORIGIN: "https://self.host" } as unknown as NodeJS.ProcessEnv); - expect(mocks.init.mock.calls[0]![0].serverName).toBe(hostname()); + expect(lastInitOptions().serverName).toBe(hostname()); + }); + + it("uses the custom OpenTelemetry setup when OTLP traces are enabled even if Sentry trace export is off", async () => { + await initSentry({ + SENTRY_DSN: "d", + OTEL_TRACES_EXPORTER: "otlp", + OTEL_EXPORTER_OTLP_ENDPOINT: "http://otel-collector:4318", + } as unknown as NodeJS.ProcessEnv); + const opts = lastInitOptions(); + expect(opts.tracesSampleRate).toBeUndefined(); + expect(opts.skipOpenTelemetrySetup).toBe(true); + }); + + it("builds the Sentry OpenTelemetry bridge, adding the span processor only when Sentry traces are sampled", async () => { + await expect(buildSentryOpenTelemetryBridge()).resolves.toBeUndefined(); + + await initSentry({ SENTRY_DSN: "d" } as unknown as NodeJS.ProcessEnv); + let bridge = await buildSentryOpenTelemetryBridge(); + expect(sentryOtelMocks.SentrySampler).not.toHaveBeenCalled(); + expect(sentryOtelMocks.SentryPropagator).toHaveBeenCalledTimes(1); + expect(mocks.SentryContextManager).toHaveBeenCalledTimes(1); + expect(sentryOtelMocks.SentrySpanProcessor).not.toHaveBeenCalled(); + bridge?.validate?.(); + expect(mocks.validateOpenTelemetrySetup).toHaveBeenCalledTimes(1); + + resetSentryForTest(); + vi.clearAllMocks(); + await initSentry({ + SENTRY_DSN: "d", + SENTRY_TRACES_SAMPLE_RATE: "0.5", + } as unknown as NodeJS.ProcessEnv); + bridge = await buildSentryOpenTelemetryBridge(); + expect(bridge?.sampler).toBeInstanceOf(sentryOtelMocks.SentrySampler); + expect(bridge?.spanProcessor).toBeInstanceOf(sentryOtelMocks.SentrySpanProcessor); }); it("uses the image-baked version as the release fallback and ignores blank overrides", async () => { @@ -362,7 +462,7 @@ describe("enabled when SENTRY_DSN is set", () => { SENTRY_RELEASE: "", GITTENSORY_VERSION: "gittensory-selfhost@0.1.0", } as unknown as NodeJS.ProcessEnv); - expect(mocks.init.mock.calls[0]![0].release).toBe( + expect(lastInitOptions().release).toBe( "gittensory-selfhost@0.1.0", ); }); @@ -384,9 +484,17 @@ describe("enabled when SENTRY_DSN is set", () => { }); expect(mocks.captureException).toHaveBeenCalledTimes(1); mocks.scope.setContext.mockClear(); + captureError(new Error("invalid install"), { + kind: "job_dead", + installation_id: "not-an-installation", + }); + expect(mocks.scope.setContext).toHaveBeenCalledWith("gittensory", { + kind: "job_dead", + }); + mocks.scope.setContext.mockClear(); captureError("plain string with no context"); expect(mocks.scope.setContext).not.toHaveBeenCalled(); - expect(mocks.captureException).toHaveBeenCalledTimes(2); + expect(mocks.captureException).toHaveBeenCalledTimes(3); }); it("captureReviewFailure sets error level + repo/PR/SHA tags, skipping null/undefined, and works without context", async () => { @@ -395,12 +503,24 @@ describe("enabled when SENTRY_DSN is set", () => { repo: "o/r", pr: 7, head_sha: "abc", + installationId: 1, + operation: "gate_decision", + decision_outcome: "failure", owner: null, }); expect(mocks.scope.setLevel).toHaveBeenCalledWith("error"); expect(mocks.scope.setTag).toHaveBeenCalledWith("repo", "o/r"); expect(mocks.scope.setTag).toHaveBeenCalledWith("pr", "7"); expect(mocks.scope.setTag).toHaveBeenCalledWith("head_sha", "abc"); + expect(mocks.scope.setTag).toHaveBeenCalledWith("installation_id_hash", "21ab41515eeee762"); + expect(mocks.scope.setTag).toHaveBeenCalledWith("operation", "gate_decision"); + expect(mocks.scope.setTag).toHaveBeenCalledWith("decision_outcome", "failure"); + expect(mocks.scope.setContext).toHaveBeenCalledWith("review", expect.objectContaining({ + installation_id_hash: "21ab41515eeee762", + })); + expect(mocks.scope.setContext).not.toHaveBeenCalledWith("review", expect.objectContaining({ + installationId: 1, + })); expect(mocks.scope.setTag).not.toHaveBeenCalledWith( "owner", expect.anything(), @@ -627,7 +747,7 @@ describe("forwardStructuredLogToSentry — central console.log → Sentry error expect(lastCapturedError().message).toBe("(JSONbored/awesome-claude#4240)"); }); - it("leads the title with the real error detail + indexes filterable tags + fingerprints by event (#observability)", async () => { + it("leads the title with the real error detail + indexes filterable hashed tenant tags + fingerprints by event (#observability)", async () => { await initSentry({ SENTRY_DSN: "d" } as unknown as NodeJS.ProcessEnv); forwardStructuredLogToSentry( JSON.stringify({ @@ -643,7 +763,14 @@ describe("forwardStructuredLogToSentry — central console.log → Sentry error expect(lastCapturedError().message).toBe("The operation was aborted due to timeout"); // The present log dimensions become filterable tags. expect(mocks.scope.setTag).toHaveBeenCalledWith("repo", "JSONbored/gittensory"); - expect(mocks.scope.setTag).toHaveBeenCalledWith("installationId", "143010787"); + expect(mocks.scope.setTag).toHaveBeenCalledWith("installation_id_hash", "68b9c2136087c5ca"); + expect(mocks.scope.setTag).not.toHaveBeenCalledWith("installationId", expect.anything()); + expect(mocks.scope.setContext).toHaveBeenCalledWith("log", expect.objectContaining({ + installation_id_hash: "68b9c2136087c5ca", + })); + expect(mocks.scope.setContext).not.toHaveBeenCalledWith("log", expect.objectContaining({ + installationId: 143010787, + })); // Recurrences of one failure group into a single issue by event. expect(mocks.scope.setFingerprint).toHaveBeenCalledWith(["gittensory-log", "orb_broker_unavailable"]); }); @@ -651,29 +778,27 @@ describe("forwardStructuredLogToSentry — central console.log → Sentry error it("strips the synthetic wrapper stack so the issue culprit is not forwardStructuredLogToSentry", async () => { await initSentry({ SENTRY_DSN: "d" } as unknown as NodeJS.ProcessEnv); forwardStructuredLogToSentry( - JSON.stringify({ level: "error", event: "orb_broker_unavailable", error: "timeout" }), + JSON.stringify({ level: "error", event: "orb_broker_unavailable", message: "timeout" }), ); - // The captured Error's stack is reduced to its header line — no "at …" frames — so Sentry cannot attribute the - // issue to this forwarder (or the console sink) the way it did when the real (synthetic) stack was attached. + const stack = lastCapturedError().stack ?? ""; expect(stack).not.toMatch(/\n\s+at /); expect(stack).toBe("orb_broker_unavailable: timeout"); }); - it("sets the issue culprit (event.transaction) to the event slug, and skips it when there is no event", async () => { + it("sets the issue culprit transaction to the event slug, and skips it when there is no event", async () => { await initSentry({ SENTRY_DSN: "d" } as unknown as NodeJS.ProcessEnv); forwardStructuredLogToSentry( - JSON.stringify({ level: "error", event: "orb_broker_unavailable", error: "timeout" }), + JSON.stringify({ level: "error", event: "orb_broker_unavailable", message: "timeout" }), ); - // The scoped event processor stamps the operational event slug as the transaction (Sentry's culprit input). + const processor = mocks.scope.addEventProcessor.mock.calls.at(-1)?.[0] as ( - e: Record, + event: Record, ) => Record; expect(processor({})).toEqual({ transaction: "orb_broker_unavailable" }); - // A no-event error log has no slug to use as a culprit, so no transaction processor is registered. - mocks.scope.addEventProcessor.mockClear(); - forwardStructuredLogToSentry(JSON.stringify({ level: "error", code: 500 })); + vi.clearAllMocks(); + forwardStructuredLogToSentry(JSON.stringify({ level: "error", message: "timeout" })); expect(mocks.scope.addEventProcessor).not.toHaveBeenCalled(); }); @@ -861,7 +986,8 @@ describe("installStructuredLogForwarding — central console sink instrumentatio ); expect(lastCapturedError().name).toBe("orb_broker_unavailable"); - expect(lastCapturedError().message).toBe("installationId=1"); + expect(lastCapturedError().message).toBe("(no message — see the log context)"); + expect(mocks.scope.setTag).toHaveBeenCalledWith("installation_id_hash", "21ab41515eeee762"); expect(base.error).toHaveBeenCalledTimes(1); }); @@ -922,82 +1048,3 @@ describe("installStructuredLogForwarding — central console sink instrumentatio expect(base.error).toHaveBeenCalledTimes(2); }); }); - -const DSN = "https://k@o.ingest/1"; -const asEnv = (e: Record) => e as unknown as NodeJS.ProcessEnv; - -describe("resolveTracesSampleRate — opt-in, clamped, safe default (#1734)", () => { - it("defaults to 0, parses a valid rate, clamps to [0,1], and treats a non-finite value as 0", () => { - expect(resolveTracesSampleRate(asEnv({}))).toBe(0); - expect(resolveTracesSampleRate(asEnv({ SENTRY_TRACES_SAMPLE_RATE: "0.25" }))).toBe(0.25); - expect(resolveTracesSampleRate(asEnv({ SENTRY_TRACES_SAMPLE_RATE: "5" }))).toBe(1); - expect(resolveTracesSampleRate(asEnv({ SENTRY_TRACES_SAMPLE_RATE: "-2" }))).toBe(0); - expect(resolveTracesSampleRate(asEnv({ SENTRY_TRACES_SAMPLE_RATE: "abc" }))).toBe(0); - }); -}); - -describe("sentrySpanAttributes — safe, low-cardinality only", () => { - it("drops secret-keyed and null/undefined keys, keeps scalars, truncates long strings", () => { - const out = sentrySpanAttributes({ - "ai.model": "gpt", - "job.attempt": 2, - ok: true, - apiKey: "shh", - token: "x", - missing: null, - undef: undefined, - nan: Number.NaN, // a non-finite number is dropped, never tagged - nested: { a: 1 }, // a non-scalar is dropped (no unbounded blobs on a span) - long: "z".repeat(200), - }); - expect(out).toEqual({ - "ai.model": "gpt", - "job.attempt": 2, - ok: true, - long: `${"z".repeat(157)}...`, - }); - }); - - it("returns an empty object for undefined input", () => { - expect(sentrySpanAttributes(undefined)).toEqual({}); - }); -}); - -describe("tracing is a complete no-op unless sampling is configured > 0 (#1734)", () => { - it("with Sentry off, withSentrySpan runs fn but starts NO span and reports tracing disabled", async () => { - expect(sentryTracingEnabled()).toBe(false); - expect(await withSentrySpan("s", { a: 1 }, async () => "r")).toBe("r"); - expect(mocks.startSpan).not.toHaveBeenCalled(); - }); - - it("with the DSN set but sample rate 0 (default), tracing stays off and starts no span", async () => { - await initSentry(asEnv({ SENTRY_DSN: DSN })); // no SENTRY_TRACES_SAMPLE_RATE → 0 - expect(sentryTracingEnabled()).toBe(false); - await withSentrySpan("s", undefined, async () => "r"); - expect(mocks.startSpan).not.toHaveBeenCalled(); - }); -}); - -describe("tracing emits spans when sampling is enabled (#1734)", () => { - beforeEach(async () => { - await initSentry(asEnv({ SENTRY_DSN: DSN, SENTRY_TRACES_SAMPLE_RATE: "1" })); - }); - - it("starts a named span tagged with safe attributes and returns fn's value", async () => { - expect(sentryTracingEnabled()).toBe(true); - const result = await withSentrySpan("selfhost.ai.provider", { "ai.model": "gpt", apiKey: "shh" }, async () => 42); - expect(result).toBe(42); - expect(mocks.startSpan).toHaveBeenCalledTimes(1); - const [opts] = mocks.startSpan.mock.calls[0]!; - expect(opts).toMatchObject({ name: "selfhost.ai.provider", op: "selfhost.ai.provider" }); - expect((opts as { attributes: Record }).attributes).toEqual({ "ai.model": "gpt" }); // secret dropped - }); - - it("propagates an error thrown by fn (the caller's error is never swallowed by the span wrapper)", async () => { - await expect( - withSentrySpan("selfhost.queue.job", undefined, async () => { - throw new Error("boom"); - }), - ).rejects.toThrow("boom"); - }); -}); diff --git a/test/unit/selfhost-tracing.test.ts b/test/unit/selfhost-tracing.test.ts index 6b7bfee4f..8c8589d4c 100644 --- a/test/unit/selfhost-tracing.test.ts +++ b/test/unit/selfhost-tracing.test.ts @@ -1,33 +1,25 @@ import { describe, it, expect, vi, beforeEach } from "vitest"; -// withReviewSpan composes the two tracer wrappers; both are mocked here as passthroughs so we assert the -// composition (one boundary → both tracers, option forwarding, value pass-through) without any real SDK. +// withReviewSpan is a small passthrough boundary; mock OTEL so we assert option forwarding and value pass-through +// without loading the real SDK. Sentry receives these spans through the OTEL bridge when configured. const otel = vi.hoisted(() => ({ withOtelSpan: vi.fn( (_name: string, _attrs: unknown, fn: () => T | Promise, _options?: unknown): T | Promise => fn(), ), })); -const sentry = vi.hoisted(() => ({ - withSentrySpan: vi.fn( - (_name: string, _attrs: unknown, fn: () => T | Promise): T | Promise => fn(), - ), -})); vi.mock("../../src/selfhost/otel", () => ({ withOtelSpan: otel.withOtelSpan })); -vi.mock("../../src/selfhost/sentry", () => ({ withSentrySpan: sentry.withSentrySpan })); import { withReviewSpan } from "../../src/selfhost/tracing"; beforeEach(() => vi.clearAllMocks()); describe("withReviewSpan — one boundary feeds both tracers (#1734)", () => { - it("runs fn through the OTEL span wrapping the Sentry span, and returns fn's value", async () => { + it("runs fn through the OTEL span and returns fn's value", async () => { const result = await withReviewSpan("selfhost.queue.job", { "job.type": "github-webhook" }, async () => "ok"); expect(result).toBe("ok"); expect(otel.withOtelSpan).toHaveBeenCalledTimes(1); - expect(sentry.withSentrySpan).toHaveBeenCalledTimes(1); - // Same span name + attributes are handed to both tracers. expect(otel.withOtelSpan.mock.calls[0]![0]).toBe("selfhost.queue.job"); - expect(sentry.withSentrySpan.mock.calls[0]![0]).toBe("selfhost.queue.job"); + expect(otel.withOtelSpan.mock.calls[0]![1]).toEqual({ "job.type": "github-webhook" }); }); it("forwards the parentTraceParent option to the OTEL span (cross-job trace continuity)", async () => {