From ed73a13a92502b63f577d28209c86cf99fc56fef Mon Sep 17 00:00:00 2001 From: Kevin Kyburz <30409887+swissky@users.noreply.github.com> Date: Thu, 18 Jun 2026 16:28:45 +0200 Subject: [PATCH 1/2] fix(core): prevent isolate-wide hang from a stalled session.get on Workers (#1274) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A request cancelled mid-`session.get("user")` on Cloudflare Workers can leave the session-store read as a promise that never settles, poisoning the isolate so every later session-bearing request hangs (≈0 CPU, multi-minute wall, `canceled`) — reliably reproducible on fresh isolates right after a deploy. The auth middleware now resolves the session user via a helper that: 1. anchors the read with `after()` so a cancelled request still drives it to completion (the promise settles; the isolate is not poisoned), and 2. applies a fail-closed timeout backstop — a still-stalled read degrades to "unauthenticated for this request" instead of hanging. --- .changeset/session-get-hang.md | 5 ++ packages/core/src/astro/middleware/auth.ts | 59 ++++++++++++++++++++-- 2 files changed, 61 insertions(+), 3 deletions(-) create mode 100644 .changeset/session-get-hang.md diff --git a/.changeset/session-get-hang.md b/.changeset/session-get-hang.md new file mode 100644 index 000000000..ad5618b7d --- /dev/null +++ b/.changeset/session-get-hang.md @@ -0,0 +1,5 @@ +--- +"emdash": patch +--- + +Fix logged-in pages hanging indefinitely on Cloudflare Workers (#1274). A request cancelled mid-`session.get("user")` could leave the session-store read as a promise that never settles, poisoning the isolate so every later session-bearing request hung (0-CPU, multi-minute, `canceled`) — reliably reproducible on fresh isolates right after a deploy. The auth middleware now anchors the session read with `after()` so a cancelled request still drives it to completion (preventing the isolate poisoning), with a fail-closed timeout backstop that degrades a still-stalled read to "unauthenticated for this request" rather than hanging. diff --git a/packages/core/src/astro/middleware/auth.ts b/packages/core/src/astro/middleware/auth.ts index 29bf57f96..934b314b0 100644 --- a/packages/core/src/astro/middleware/auth.ts +++ b/packages/core/src/astro/middleware/auth.ts @@ -20,6 +20,7 @@ import { authenticate as virtualAuthenticate } from "virtual:emdash/auth"; // @ts-ignore - virtual module import virtualConfig from "virtual:emdash/config"; +import { after } from "../../after.js"; import { checkPublicCsrf } from "../../api/csrf.js"; import { apiError } from "../../api/error.js"; import { getPublicOrigin } from "../../api/public-url.js"; @@ -35,6 +36,58 @@ import type { ExternalAuthConfig } from "../../auth/types.js"; import type { EmDashHandlers } from "../types.js"; import { buildEmDashCsp } from "./csp.js"; +/** + * Backstop timeout for resolving the session user. A live session-store read + * settles in a few ms, so this only ever fires on a genuinely stalled read. + */ +const SESSION_GET_TIMEOUT_MS = 3_000; + +/** + * Resolve the session user without risking an isolate-wide hang. + * + * On Cloudflare Workers, a request cancelled mid-`session.get()` (client + * disconnect, context teardown) can leave the underlying session-store read as + * a promise that never settles — neither resolving nor rejecting. Awaiting it + * directly hangs the request until the platform kills it, and because the + * stalled promise is shared at the isolate level, every later session-bearing + * request hangs too (observed as 0-CPU, multi-minute, `canceled` responses; + * see #1274). A surrounding try/catch cannot help: the promise never rejects. + * + * Two layers, mirroring the reclaimable-cache pattern used elsewhere in core: + * 1. `after()` anchors the read so a cancelled request still drives it to + * completion — the promise settles and the isolate is not poisoned for + * subsequent requests (prevents the hang rather than merely surviving it). + * 2. A timeout is a fail-closed backstop: a still-stalled read resolves to + * `undefined`, and every caller treats the absence of a session user as + * unauthenticated (anonymous on public routes, 401/redirect on protected + * ones). It can only ever drop privileges for that one request, never grant + * them. + */ +async function resolveSessionUser( + session: { get(key: "user"): Promise } | undefined, + timeoutMs = SESSION_GET_TIMEOUT_MS, +): Promise { + if (!session) return undefined; + const read: Promise = Promise.resolve(session.get("user")).catch(() => undefined); + // Keep the worker alive past response/cancellation so the read completes and + // the shared promise settles — this is what prevents the isolate poisoning. + after(() => + read.then( + () => undefined, + () => undefined, + ), + ); + let timer: ReturnType | undefined; + const timeout = new Promise((resolve) => { + timer = setTimeout(resolve, timeoutMs, undefined); + }); + try { + return await Promise.race([read, timeout]); + } finally { + clearTimeout(timer); + } +} + declare global { namespace App { interface Locals { @@ -407,7 +460,7 @@ async function handlePluginRouteAuth( try { // Try session auth (sets locals.user if session exists) const { session } = context; - const sessionUser = await session?.get("user"); + const sessionUser = await resolveSessionUser(session); if (sessionUser?.id && emdash?.db) { const adapter = createKyselyAdapter(emdash.db); const user = await adapter.getUserById(sessionUser.id); @@ -435,7 +488,7 @@ async function handlePublicRouteAuth( const { emdash } = locals; try { - const sessionUser = await session?.get("user"); + const sessionUser = await resolveSessionUser(session); if (sessionUser?.id && emdash?.db) { const adapter = createKyselyAdapter(emdash.db); const user = await adapter.getUserById(sessionUser.id); @@ -650,7 +703,7 @@ async function handlePasskeyAuth( try { // Check session for user (session.get returns a Promise) - const sessionUser = await session?.get("user"); + const sessionUser = await resolveSessionUser(session); if (!sessionUser?.id) { if (isApiRoute) { From 8dd418061cc78ff889c38c6241bc27c67e518d1e Mon Sep 17 00:00:00 2001 From: Kevin Kyburz <30409887+swissky@users.noreply.github.com> Date: Thu, 18 Jun 2026 18:13:49 +0200 Subject: [PATCH 2/2] fix(core): apply session-read guard to all call sites + add tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review feedback on #1538 — the guard was under-applied. Two structurally identical bare `await session.get("user")` reads were left unhardened: - middleware.ts: the main middleware runs *before* the auth middleware (both order: "pre"), so on a session-bearing request this is the *first* session read — it could hang and poison the isolate before the auth fix ever runs. - routes/api/snapshot.ts: a PUBLIC_API_EXACT route the auth middleware skips, so its manual read is the only session lookup. Extract the guard to a shared `astro/session-user.ts` (`resolveSessionUser`) and apply it at all three sites (main middleware keeps its `!hasSessionCookie` short-circuit, the #733 KV-read-miss optimization). Add unit tests for the two testable safety behaviors: timeout backstop on a never-settling read, and fail-closed on a rejecting read. --- .changeset/session-get-hang.md | 2 +- packages/core/src/astro/middleware.ts | 3 +- packages/core/src/astro/middleware/auth.ts | 54 +----------------- .../core/src/astro/routes/api/snapshot.ts | 4 +- packages/core/src/astro/session-user.ts | 57 +++++++++++++++++++ .../tests/unit/astro/session-user.test.ts | 34 +++++++++++ 6 files changed, 98 insertions(+), 56 deletions(-) create mode 100644 packages/core/src/astro/session-user.ts create mode 100644 packages/core/tests/unit/astro/session-user.test.ts diff --git a/.changeset/session-get-hang.md b/.changeset/session-get-hang.md index ad5618b7d..885fef90a 100644 --- a/.changeset/session-get-hang.md +++ b/.changeset/session-get-hang.md @@ -2,4 +2,4 @@ "emdash": patch --- -Fix logged-in pages hanging indefinitely on Cloudflare Workers (#1274). A request cancelled mid-`session.get("user")` could leave the session-store read as a promise that never settles, poisoning the isolate so every later session-bearing request hung (0-CPU, multi-minute, `canceled`) — reliably reproducible on fresh isolates right after a deploy. The auth middleware now anchors the session read with `after()` so a cancelled request still drives it to completion (preventing the isolate poisoning), with a fail-closed timeout backstop that degrades a still-stalled read to "unauthenticated for this request" rather than hanging. +Fix logged-in pages hanging indefinitely on Cloudflare Workers (#1274). A request cancelled mid-`session.get("user")` could leave the session-store read as a promise that never settles, poisoning the isolate so every later session-bearing request hung (0-CPU, multi-minute, `canceled`) — reliably reproducible on fresh isolates right after a deploy. Every session read on the request path (the main middleware, the auth middleware, and the preview-snapshot route) now goes through a shared `resolveSessionUser()` helper that anchors the read with `after()` so a cancelled request still drives it to completion (preventing the isolate poisoning), with a fail-closed timeout backstop that degrades a still-stalled read to "unauthenticated for this request" rather than hanging. diff --git a/packages/core/src/astro/middleware.ts b/packages/core/src/astro/middleware.ts index 06bb3b36b..ca6676d87 100644 --- a/packages/core/src/astro/middleware.ts +++ b/packages/core/src/astro/middleware.ts @@ -62,6 +62,7 @@ import type { EmDashConfig } from "./integration/runtime.js"; import { wrapBodyForStreamMetrics } from "./middleware/stream-end-metrics.js"; import { prefetchLayoutData } from "./prefetch.js"; import { createPublicPluginApiRouteHandler } from "./public-plugin-api-routes.js"; +import { resolveSessionUser } from "./session-user.js"; import type { EmDashHandlers } from "./types.js"; /** @@ -416,7 +417,7 @@ export const onRequest = defineMiddleware(async (context, next) => { // turns normal traffic into a flood of KV read misses. See #733. const hasSessionCookie = cookies.get("astro-session") !== undefined; const sessionUser = - context.isPrerendered || !hasSessionCookie ? null : await context.session?.get("user"); + context.isPrerendered || !hasSessionCookie ? null : await resolveSessionUser(context.session); if (!isEmDashRoute && !isPublicRuntimeRoute && !hasEditCookie && !hasPreviewToken) { if (!sessionUser && !playgroundDb) { diff --git a/packages/core/src/astro/middleware/auth.ts b/packages/core/src/astro/middleware/auth.ts index 934b314b0..cce225e12 100644 --- a/packages/core/src/astro/middleware/auth.ts +++ b/packages/core/src/astro/middleware/auth.ts @@ -20,7 +20,6 @@ import { authenticate as virtualAuthenticate } from "virtual:emdash/auth"; // @ts-ignore - virtual module import virtualConfig from "virtual:emdash/config"; -import { after } from "../../after.js"; import { checkPublicCsrf } from "../../api/csrf.js"; import { apiError } from "../../api/error.js"; import { getPublicOrigin } from "../../api/public-url.js"; @@ -33,61 +32,10 @@ import { resolveApiToken, resolveOAuthToken } from "../../api/handlers/api-token import { hasScope } from "../../auth/api-tokens.js"; import { getAuthMode, type ExternalAuthMode } from "../../auth/mode.js"; import type { ExternalAuthConfig } from "../../auth/types.js"; +import { resolveSessionUser } from "../session-user.js"; import type { EmDashHandlers } from "../types.js"; import { buildEmDashCsp } from "./csp.js"; -/** - * Backstop timeout for resolving the session user. A live session-store read - * settles in a few ms, so this only ever fires on a genuinely stalled read. - */ -const SESSION_GET_TIMEOUT_MS = 3_000; - -/** - * Resolve the session user without risking an isolate-wide hang. - * - * On Cloudflare Workers, a request cancelled mid-`session.get()` (client - * disconnect, context teardown) can leave the underlying session-store read as - * a promise that never settles — neither resolving nor rejecting. Awaiting it - * directly hangs the request until the platform kills it, and because the - * stalled promise is shared at the isolate level, every later session-bearing - * request hangs too (observed as 0-CPU, multi-minute, `canceled` responses; - * see #1274). A surrounding try/catch cannot help: the promise never rejects. - * - * Two layers, mirroring the reclaimable-cache pattern used elsewhere in core: - * 1. `after()` anchors the read so a cancelled request still drives it to - * completion — the promise settles and the isolate is not poisoned for - * subsequent requests (prevents the hang rather than merely surviving it). - * 2. A timeout is a fail-closed backstop: a still-stalled read resolves to - * `undefined`, and every caller treats the absence of a session user as - * unauthenticated (anonymous on public routes, 401/redirect on protected - * ones). It can only ever drop privileges for that one request, never grant - * them. - */ -async function resolveSessionUser( - session: { get(key: "user"): Promise } | undefined, - timeoutMs = SESSION_GET_TIMEOUT_MS, -): Promise { - if (!session) return undefined; - const read: Promise = Promise.resolve(session.get("user")).catch(() => undefined); - // Keep the worker alive past response/cancellation so the read completes and - // the shared promise settles — this is what prevents the isolate poisoning. - after(() => - read.then( - () => undefined, - () => undefined, - ), - ); - let timer: ReturnType | undefined; - const timeout = new Promise((resolve) => { - timer = setTimeout(resolve, timeoutMs, undefined); - }); - try { - return await Promise.race([read, timeout]); - } finally { - clearTimeout(timer); - } -} - declare global { namespace App { interface Locals { diff --git a/packages/core/src/astro/routes/api/snapshot.ts b/packages/core/src/astro/routes/api/snapshot.ts index b1141656f..0b5c2f9ec 100644 --- a/packages/core/src/astro/routes/api/snapshot.ts +++ b/packages/core/src/astro/routes/api/snapshot.ts @@ -20,6 +20,8 @@ import { import { getPublicOrigin } from "#api/public-url.js"; import { resolveSecretsCached } from "#config/secrets.js"; +import { resolveSessionUser } from "../../session-user.js"; + export const prerender = false; export const GET: APIRoute = async ({ request, locals, url, session }) => { @@ -31,7 +33,7 @@ export const GET: APIRoute = async ({ request, locals, url, session }) => { if (!user && session && emdash?.db) { try { const { createKyselyAdapter } = await import("@emdash-cms/auth/adapters/kysely"); - const sessionUser = await session.get("user"); + const sessionUser = await resolveSessionUser(session); if (sessionUser?.id) { const adapter = createKyselyAdapter(emdash.db); const resolved = await adapter.getUserById(sessionUser.id); diff --git a/packages/core/src/astro/session-user.ts b/packages/core/src/astro/session-user.ts new file mode 100644 index 000000000..6976b8685 --- /dev/null +++ b/packages/core/src/astro/session-user.ts @@ -0,0 +1,57 @@ +import { after } from "../after.js"; + +/** + * Backstop timeout for resolving the session user. A live session-store read + * settles in a few ms, so this only ever fires on a genuinely stalled read. + */ +export const SESSION_GET_TIMEOUT_MS = 3_000; + +/** + * Resolve the Astro session user without risking an isolate-wide hang. + * + * On Cloudflare Workers, a request cancelled mid-`session.get()` (client + * disconnect, context teardown) can leave the underlying session-store read as + * a promise that never settles — neither resolving nor rejecting. Awaiting it + * directly hangs the request, and because the stalled promise is shared at the + * isolate level, every later session-bearing request hangs too (observed as + * 0-CPU, multi-minute, `canceled` responses; see #1274). A surrounding + * try/catch cannot help: the promise never rejects. + * + * Two layers, mirroring the reclaimable-cache pattern used elsewhere in core: + * 1. `after()` anchors the read so a cancelled request still drives it to + * completion — the promise settles and the isolate is not poisoned for + * subsequent requests (prevents the hang rather than merely surviving it). + * 2. A timeout is a fail-closed backstop: a still-stalled (or rejecting) read + * resolves to `undefined`, and every caller treats the absence of a session + * user as unauthenticated (anonymous on public routes, 401/redirect on + * protected ones). It can only ever drop privileges for that one request, + * never grant them. + * + * Used by every session read on the request path: the main middleware (the + * first read on a session-bearing request), the auth middleware, and the + * preview-snapshot route (which bypasses the auth middleware). + */ +export async function resolveSessionUser( + session: { get(key: "user"): Promise } | undefined, + timeoutMs = SESSION_GET_TIMEOUT_MS, +): Promise { + if (!session) return undefined; + const read: Promise = Promise.resolve(session.get("user")).catch(() => undefined); + // Keep the worker alive past response/cancellation so the read completes and + // the shared promise settles — this is what prevents the isolate poisoning. + after(() => + read.then( + () => undefined, + () => undefined, + ), + ); + let timer: ReturnType | undefined; + const timeout = new Promise((resolve) => { + timer = setTimeout(resolve, timeoutMs, undefined); + }); + try { + return await Promise.race([read, timeout]); + } finally { + clearTimeout(timer); + } +} diff --git a/packages/core/tests/unit/astro/session-user.test.ts b/packages/core/tests/unit/astro/session-user.test.ts new file mode 100644 index 000000000..d83e39a49 --- /dev/null +++ b/packages/core/tests/unit/astro/session-user.test.ts @@ -0,0 +1,34 @@ +import { describe, expect, it, vi } from "vitest"; + +// `after()` lazily imports this virtual module (provided by the Astro +// integration at build time, absent in unit tests). Stub it so the lazy +// import resolves cleanly to a no-op lifetime extender. +vi.mock("virtual:emdash/wait-until", () => ({ waitUntil: undefined })); + +import { resolveSessionUser } from "../../../src/astro/session-user.js"; + +describe("resolveSessionUser", () => { + it("returns the session user when the read resolves", async () => { + const session = { get: vi.fn(async () => ({ id: "user_1" })) }; + await expect(resolveSessionUser(session)).resolves.toEqual({ id: "user_1" }); + expect(session.get).toHaveBeenCalledWith("user"); + }); + + it("returns undefined when there is no session", async () => { + await expect(resolveSessionUser(undefined)).resolves.toBeUndefined(); + }); + + // The core #1274 guarantee: a read that never settles (a workerd context + // cancelled mid-read) must not hang — the timeout backstop resolves it. + it("falls back to undefined instead of hanging when the read never settles", async () => { + const session = { get: () => new Promise<{ id: string }>(() => {}) }; + await expect(resolveSessionUser(session, 20)).resolves.toBeUndefined(); + }); + + // Fail-closed: a rejecting read resolves to undefined (caller treats the + // request as unauthenticated) rather than throwing. + it("fails closed to undefined when the read rejects", async () => { + const session = { get: () => Promise.reject(new Error("boom")) }; + await expect(resolveSessionUser(session, 20)).resolves.toBeUndefined(); + }); +});