rivet-dev
diff --git a/‎.agent/notes/driver-engine-static-test-order.md‎
Lines changed: 190 additions & 0 deletions b/‎.agent/notes/driver-engine-static-test-order.md‎
Lines changed: 190 additions & 0 deletions
diff --git a/‎.gitattributes‎
Lines changed: 1 addition & 0 deletions b/‎.gitattributes‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎CLAUDE.md‎
Lines changed: 6 additions & 1 deletion b/‎CLAUDE.md‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎Cargo.toml‎
Lines changed: 3 additions & 0 deletions b/‎Cargo.toml‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎engine/CLAUDE.md‎
Lines changed: 0 additions & 4 deletions b/‎engine/CLAUDE.md‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎engine/packages/guard/src/routing/pegboard_gateway/resolve_actor_query.rs‎
Lines changed: 21 additions & 1 deletion b/‎engine/packages/guard/src/routing/pegboard_gateway/resolve_actor_query.rs‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎engine/sdks/rust/envoy-client/src/connection.rs‎
Lines changed: 11 additions & 0 deletions b/‎engine/sdks/rust/envoy-client/src/connection.rs‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎engine/sdks/rust/envoy-client/src/envoy.rs‎
Lines changed: 2 additions & 0 deletions b/‎engine/sdks/rust/envoy-client/src/envoy.rs‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎engine/sdks/rust/envoy-client/src/handle.rs‎
Lines changed: 2 additions & 0 deletions b/‎engine/sdks/rust/envoy-client/src/handle.rs‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎rivetkit-typescript/packages/rivetkit-native/index.d.ts‎
Lines changed: 24 additions & 5 deletions b/‎rivetkit-typescript/packages/rivetkit-native/index.d.ts‎
Lines changed: 24 additions & 5 deletions
@@ -0,0 +1,190 @@
+# Driver Engine Static Test Order
+
+This note breaks the `driver-engine.test.ts` suite into file-name groups for static-only debugging.
+
+Scope:
+- `registry (static)` only
+- `client type (http)` only unless a specific bug points to inline client behavior
+- `encoding (bare)` only unless a specific bug points to CBOR or JSON
+- Exclude `agent-os` from the normal pass target
+- Exclude `dynamic-reload` from the static pass target
+
+Checklist rules:
+- A checkbox is marked only when the entire `*.ts` file has been covered and is fully passing.
+- Do not check a file off just because investigation started.
+- Start with a single test name, not a whole file-group or suite label.
+- After one single test passes, grow scope within that same file until the entire file passes.
+- Do not start the next tracked file until the current file is fully passing.
+- If a widened file run fails, stop expanding scope and fix that same file before running anything from the next file.
+- Record average duration only after the full file is passing.
+- The filenames in this note are tracking labels only. `pnpm test ... -t` does not filter by `src/driver-test-suite/tests/<file>.ts`.
+- `driver-engine.test.ts` wires everything into nested `describe(...)` blocks, so filter by the description text from the suite, plus the static path text when needed: `registry (static)`, `client type (http)`, and `encoding (bare)`.
+
+## How To Filter
+
+Use `-t` against the `describe(...)` text, not the filename from this note.
+
+Base command shape:
+
+```bash
+cd rivetkit-typescript/packages/rivetkit
+pnpm test driver-engine.test.ts -t "registry \\(static\\).*client type \\(http\\).*encoding \\(bare\\).*<suite description text>"
+```
+
+To narrow to one single test inside that suite, append a stable chunk of the test name:
+
+```bash
+cd rivetkit-typescript/packages/rivetkit
+pnpm test driver-engine.test.ts -t "registry \\(static\\).*client type \\(http\\).*encoding \\(bare\\).*Actor Driver Tests.*should"
+```
+
+Common suite-description mappings:
+- `actor-state.ts` -> `Actor State Tests`
+- `actor-schedule.ts` -> `Actor Schedule Tests`
+- `actor-sleep.ts` -> `Actor Sleep Tests`
+- `actor-sleep-db.ts` -> `Actor Sleep Database Tests`
+- `actor-lifecycle.ts` -> `Actor Lifecycle Tests`
+- `manager-driver.ts` -> `Manager Driver Tests`
+- `actor-conn.ts` -> `Actor Connection Tests`
+- `actor-conn-state.ts` -> `Actor Connection State Tests`
+- `conn-error-serialization.ts` -> `Connection Error Serialization Tests`
+- `access-control.ts` -> `access control`
+- `actor-vars.ts` -> `Actor Variables`
+- `actor-db.ts` -> `Actor Database (raw) Tests`, `Actor Database (drizzle) Tests`, or `Actor Database Lifecycle Cleanup Tests`
+- `raw-http.ts` -> `raw http`
+- `raw-http-request-properties.ts` -> `raw http request properties`
+- `raw-websocket.ts` -> `raw websocket`
+- `hibernatable-websocket-protocol.ts` -> `hibernatable websocket protocol`
+- `cross-backend-vfs.ts` -> `Cross-Backend VFS Compatibility Tests`
+- `actor-agent-os.ts` -> `Actor agentOS Tests`
+- `dynamic-reload.ts` -> `Dynamic Actor Reload Tests`
+- `actor-conn-status.ts` -> `Connection Status Changes`
+- `gateway-routing.ts` -> `Gateway Routing`
+- `lifecycle-hooks.ts` -> `Lifecycle Hooks`
+
+Why this order:
+- The suite currently pays the full harness cost for every test:
+  - fresh namespace
+  - fresh runner config
+  - fresh envoy/driver lifecycle
+- Cheap tests are mostly harness overhead
+- Slow tests are concentrated in sleep, sandbox, workflow, and DB stress categories
+- Wrapper suites that pull in sleep-heavy children should be treated as slow even if the wrapper filename looks generic
+- Files that use sleep/hibernation waits or `describe.sequential` should not stay in the fast block
+
+## Fastest First
+
+These are the best initial groups for static-only bring-up.
+
+- [x] `manager-driver.ts` - avg ~10.3s/test over 16 tests, suite 15.1s
+- [x] `actor-conn.ts` - avg ~8.4s/test over 23 tests, suite 16.0s
+- [x] `actor-conn-state.ts` - avg ~9.3s/test over 8 tests, suite 9.9s
+- [x] `conn-error-serialization.ts` - avg ~8.2s/test over 2 tests, suite 8.2s
+- [x] `actor-destroy.ts` - avg ~9.8s/test over 10 tests, suite 10.2s
+- [x] `request-access.ts` - avg ~9.1s/test over 4 tests, suite 9.1s
+- [x] `actor-handle.ts` - avg ~7.7s/test over 12 tests, suite 8.3s
+- [x] `action-features.ts` - avg ~8.3s/test over 11 tests, suite 8.8s
+- [x] `access-control.ts` - avg ~8.5s/test over 8 tests, suite 8.8s
+- [x] `actor-vars.ts` - avg ~8.3s/test over 5 tests, suite 8.5s
+- [x] `actor-metadata.ts` - avg ~8.3s/test over 6 tests, suite 8.4s
+- [x] `actor-onstatechange.ts` - avg ~8.3s/test over 5 tests, suite 8.3s
+- [x] `actor-db.ts` - avg ~9.5s/test over 28 tests, suite 27.0s
+- [x] `actor-workflow.ts` - avg ~9.2s/test over 19 tests, suite 11.9s
+- [x] `actor-error-handling.ts` - avg ~8.5s/test over 7 tests, suite 8.5s
+- [x] `actor-queue.ts` - avg ~9.3s/test over 25 tests, suite 17.5s
+- [x] `actor-inline-client.ts` - avg ~9.0s/test over 5 tests, suite 9.8s
+- [x] `actor-kv.ts` - avg ~8.4s/test over 3 tests, suite 8.4s
+- [x] `actor-stateless.ts` - avg ~8.6s/test over 6 tests, suite 9.1s
+- [x] `raw-http.ts` - avg ~8.6s/test over 15 tests, suite 10.1s
+- [x] `raw-http-request-properties.ts` - avg ~8.5s/test over 16 tests, suite 9.9s
+- [x] `raw-websocket.ts` - avg ~8.9s/test over 13 tests, suite 11.1s
+- [x] `actor-inspector.ts` - avg ~9.6s/test over 20 tests, suite 12.1s
+- [x] `gateway-query-url.ts` - avg ~8.3s/test over 2 tests, suite 8.3s
+- [x] `actor-db-kv-stats.ts` - avg ~9.0s/test over 11 tests, suite 9.9s
+- [x] `actor-db-pragma-migration.ts` - avg ~8.8s/test over 4 tests, suite 9.0s
+- [x] `actor-state-zod-coercion.ts` - avg ~8.8s/test over 3 tests, suite 8.8s
+- [ ] `actor-conn-status.ts`
+- [ ] `gateway-routing.ts`
+- [ ] `lifecycle-hooks.ts`
+
+## Slow End
+
+These should run last because they are the most likely to dominate wall time.
+
+- [x] `actor-state.ts` - avg ~9.0s/test over 3 tests, suite 9.1s
+- [x] `actor-schedule.ts` - avg ~9.9s/test over 4 tests, suite 9.9s
+- [ ] `actor-sleep.ts`
+- [ ] `actor-sleep-db.ts`
+- [ ] `actor-lifecycle.ts`
+- [ ] `actor-conn-hibernation.ts`
+- [ ] `actor-run.ts`
+- [ ] `actor-sandbox.ts`
+- [ ] `hibernatable-websocket-protocol.ts`
+- [ ] `cross-backend-vfs.ts`
+- [ ] `actor-db-stress.ts`
+
+## Not In Static Pass
+
+These should not block the static-only pass target.
+
+- [ ] `actor-agent-os.ts`
+  Explicitly allowed to skip for now.
+- [ ] `dynamic-reload.ts`
+  Dynamic-only path.
+
+## Files Present But Not Wired In `runDriverTests`
+
+- [ ] `raw-http-direct-registry.ts` - intentionally commented out (blocked on gateway actor queries)
+- [ ] `raw-websocket-direct-registry.ts` - intentionally commented out (blocked on gateway actor queries)
+
+## Suggested Static-Only Debugging Sequence
+
+Use one single test at a time with `-t`, then grow scope within the same file only after that single test passes.
+
+- [ ] Run one single test from the next unchecked file.
+- [ ] Fix the first failing single test before expanding scope.
+- [ ] After one test passes, widen to the rest of that file until the entire file passes.
+- [ ] Check the file off only after the entire file is passing.
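+- [ ] After the fast block is clean, run the medium-cost block (this note does not define a separate medium-cost section yet; until one is added, continue with the `Slow End` files).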
+- [ ] Run the slow-end block last.
+- [ ] Run `agent-os` separately only if explicitly needed.
+
+## Example Commands
+
+Run one tracked file-group by suite description:
+
+```bash
+cd rivetkit-typescript/packages/rivetkit
+pnpm test driver-engine.test.ts -t "registry \\(static\\).*client type \\(http\\).*encoding \\(bare\\).*Actor Driver Tests"
+```
+
+Run one single test inside that tracked file-group:
+
+```bash
+cd rivetkit-typescript/packages/rivetkit
+pnpm test driver-engine.test.ts -t "registry \\(static\\).*client type \\(http\\).*encoding \\(bare\\).*Actor Driver Tests.*should create actors"
+```
+
+Run a slow group explicitly by suite description:
+
+```bash
+cd rivetkit-typescript/packages/rivetkit
+pnpm test driver-engine.test.ts -t "registry \\(static\\).*client type \\(http\\).*encoding \\(bare\\).*Actor Sleep Database Tests"
+```
+
+Run sandbox only:
+
+```bash
+cd rivetkit-typescript/packages/rivetkit
+pnpm test driver-engine.test.ts -t "registry \\(static\\).*client type \\(http\\).*encoding \\(bare\\).*Actor Sandbox Tests"
+```
+
+## Evidence For Slow Ordering
+
+Observed from the current full-run log:
+- cheap tests like raw HTTP property checks take roughly 1 second end-to-end, including teardown
+- sandbox tests are about 8.5 to 8.8 seconds each
+- sleep and sleep-db groups show repeated alarm/sleep cycles and are consistently the longest-running categories in the log
+- `actor-state.ts`, `actor-schedule.ts`, `actor-sleep.ts`, `actor-sleep-db.ts`, and `actor-lifecycle.ts` are all called directly from `mod.ts` and inherit the sleep-heavy cost profile
+- `actor-run.ts`, `actor-conn-hibernation.ts`, and `hibernatable-websocket-protocol.ts` all spend real time in sleep or hibernation waits
+- the suite-wide average is inflated by the repeated harness lifecycle and these slow categories
@@ -17,6 +17,7 @@ engine/sdks/typescript/runner/** linguist-generated=false
 engine/sdks/typescript/test-runner/** linguist-generated=false
 engine/sdks/rust/data/** linguist-generated=false
 engine/sdks/rust/*-protocol/** linguist-generated=false
+engine/sdks/rust/envoy-client/** linguist-generated=false
 engine/sdks/schemas/** linguist-generated=false
 engine/docker/dev/** linguist-generated=true
 engine/docker/dev-host/** linguist-generated=true
 
@@ -64,6 +64,8 @@ cargo test -- --nocapture
 cargo clippy -- -W warnings
 ```
 
+- Ensure lefthook is installed and enabled for git hooks (`lefthook install`).
+
 ### Docker Development Environment
 ```bash
 # Start the development environment with all services
@@ -292,7 +294,10 @@ let error_with_meta = ApiRateLimited { limit: 100, reset_at: 1234567890 }.build(
 - Connection pooling through `packages/common/pools/`
 
 **Performance**
-- ALWAYS prefer a dedicated concurrency container like `scc::HashMap<_, _>` with its async api or `moka::Cache` over `Arc<Mutex<HashMap<_, _>>>`. `Arc<Mutex<_>>` is very slow for containers.
+- Never use `Mutex<HashMap<...>>` or `RwLock<HashMap<...>>`.
+- Use `scc::HashMap` (preferred), `moka::Cache` (for TTL/bounded), or `DashMap` for concurrent maps.
+- Use `scc::HashSet` instead of `Mutex<HashSet<...>>` for concurrent sets.
+- `scc` async methods do not hold locks across `.await` points. Use `entry_async` for atomic read-then-write.
 
 ### Code Style
 - Hard tabs for Rust formatting (see `rustfmt.toml`)
 
@@ -490,6 +490,9 @@ members = [
     package = "rivet-util"
     path = "engine/packages/util"
 
+    [workspace.dependencies.rivet-util-serde]
+    path = "engine/packages/util-serde"
+
     [workspace.dependencies.rivet-util-id]
     path = "engine/packages/util-id"
 
 
@@ -33,10 +33,6 @@ When changing a versioned VBARE schema, follow the existing migration pattern.
 - When adding fields to epoxy workflow state structs, mark them `#[serde(default)]` so Gasoline can replay older serialized state.
 - Epoxy integration tests that spin up `tests/common::TestCtx` must call `shutdown()` before returning.
 
-## Concurrent containers
-
-Never use `Mutex<HashMap<...>>` or `RwLock<HashMap<...>>`. Use `scc::HashMap` (preferred), `moka::Cache` (for TTL/bounded), or `DashMap`. Same for sets: use `scc::HashSet` instead of `Mutex<HashSet<...>>`. Note that `scc` async methods do not hold locks across `.await` points. Use `entry_async` for atomic read-then-write.
-
 ## Test snapshots
 
 Use `test-snapshot-gen` to generate and load RocksDB snapshots of the full UDB KV store for migration and integration tests. Scenarios produce per-replica RocksDB checkpoints stored under `engine/packages/test-snapshot-gen/snapshots/` (git LFS tracked). In tests, use `test_snapshot::SnapshotTestCtx::from_snapshot("scenario-name")` to boot a cluster from snapshot data. See `docs-internal/engine/TEST_SNAPSHOTS.md` for the full guide.
@@ -207,7 +207,27 @@ async fn resolve_query_target_dc_label(
 }
 
 fn serialize_actor_key(key: &[String]) -> Result<String> {
-	serde_json::to_string(key).context("failed to serialize actor key")
+	const EMPTY_KEY: &str = "/";
+	const KEY_SEPARATOR: char = '/';
+
+	if key.is_empty() {
+		return Ok(EMPTY_KEY.to_string());
+	}
+
+	let mut escaped_parts = Vec::with_capacity(key.len());
+	for part in key {
+		if part.is_empty() {
+			escaped_parts.push(String::from("\\0"));
+			continue;
+		}
+
+		let escaped = part
+			.replace('\\', "\\\\")
+			.replace(KEY_SEPARATOR, "\\/");
+		escaped_parts.push(escaped);
+	}
+
+	Ok(escaped_parts.join(EMPTY_KEY))
 }
 
 fn is_duplicate_key_error(err: &anyhow::Error) -> bool {
 
@@ -1,4 +1,5 @@
 use std::sync::Arc;
+use std::sync::atomic::Ordering;
 
 use futures_util::{SinkExt, StreamExt};
 use rivet_envoy_protocol as protocol;
@@ -22,6 +23,11 @@ async fn connection_loop(shared: Arc<SharedContext>) {
 	let mut attempt = 0u32;
 
 	loop {
+		if shared.shutting_down.load(Ordering::Acquire) {
+			tracing::debug!("stopping reconnect loop because envoy is shutting down");
+			return;
+		}
+
 		let connected_at = std::time::Instant::now();
 
 		match single_connection(&shared).await {
@@ -51,6 +57,11 @@ async fn connection_loop(shared: Arc<SharedContext>) {
 			attempt = 0;
 		}
 
+		if shared.shutting_down.load(Ordering::Acquire) {
+			tracing::debug!("skipping reconnect because envoy is shutting down");
+			return;
+		}
+
 		let delay = calculate_backoff(attempt, &BackoffOptions::default());
 		tracing::info!(attempt, delay_ms = delay.as_millis() as u64, "reconnecting");
 		tokio::time::sleep(delay).await;
 
@@ -1,6 +1,7 @@
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::sync::OnceLock;
+use std::sync::atomic::Ordering;
 
 use rivet_envoy_protocol as protocol;
 use tokio::sync::mpsc;
@@ -396,6 +397,7 @@ async fn handle_shutdown(ctx: &mut EnvoyContext) {
 		return;
 	}
 	ctx.shutting_down = true;
+	ctx.shared.shutting_down.store(true, Ordering::Release);
 
 	tracing::debug!("envoy received shutdown");
 
 
@@ -1,4 +1,5 @@
 use std::sync::Arc;
+use std::sync::atomic::Ordering;
 
 use rivet_envoy_protocol as protocol;
 
@@ -15,6 +16,7 @@ pub struct EnvoyHandle {
 
 impl EnvoyHandle {
 	pub fn shutdown(&self, immediate: bool) {
+		self.shared.shutting_down.store(true, Ordering::Release);
 		if immediate {
 			let _ = self.shared.envoy_tx.send(ToEnvoyMessage::Stop);
 		} else {
 
@@ -3,6 +3,20 @@
 
 /* auto-generated by NAPI-RS */
 
+export interface JsBindParam {
+  kind: string
+  intValue?: number
+  floatValue?: number
+  textValue?: string
+  blobValue?: Buffer
+}
+export interface ExecuteResult {
+  changes: number
+}
+export interface QueryResult {
+  columns: Array<string>
+  rows: Array<Array<any>>
+}
 /** Open a native SQLite database backed by the envoy's KV channel. */
 export declare function openDatabaseFromEnvoy(jsHandle: JsEnvoyHandle, actorId: string): Promise<JsNativeDatabase>
 /** Configuration for starting the native envoy client. */
@@ -44,7 +58,12 @@ export declare function startEnvoySyncJs(config: JsEnvoyConfig, eventCallback: (
 /** Start the native envoy client asynchronously. */
 export declare function startEnvoyJs(config: JsEnvoyConfig, eventCallback: (event: any) => void): JsEnvoyHandle
 /** Native SQLite database handle exposed to JavaScript. */
-export declare class JsNativeDatabase { }
+export declare class JsNativeDatabase {
+  run(sql: string, params?: Array<JsBindParam> | undefined | null): Promise<ExecuteResult>
+  query(sql: string, params?: Array<JsBindParam> | undefined | null): Promise<QueryResult>
+  exec(sql: string): Promise<QueryResult>
+  close(): Promise<void>
+}
 /** Native envoy handle exposed to JavaScript via N-API. */
 export declare class JsEnvoyHandle {
   started(): Promise<void>
@@ -64,10 +83,10 @@ export declare class JsEnvoyHandle {
   kvDrop(actorId: string): Promise<void>
   restoreHibernatingRequests(actorId: string, requests: Array<HibernatingRequestEntry>): void
   sendHibernatableWebSocketMessageAck(gatewayId: Buffer, requestId: Buffer, clientMessageIndex: number): void
-  startServerless(payload: Buffer): Promise<void>
-  /** Send a message on an open WebSocket connection. */
-  sendWsMessage(gatewayId: Buffer, requestId: Buffer, data: Buffer, binary: boolean): void
+  /** Send a message on an open WebSocket connection identified by gatewayId and requestId. */
+  sendWsMessage(gatewayId: Buffer, requestId: Buffer, data: Buffer, binary: boolean): Promise<void>
   /** Close an open WebSocket connection. */
-  closeWebsocket(gatewayId: Buffer, requestId: Buffer, code?: number | undefined | null, reason?: string | undefined | null): void
+  closeWebsocket(gatewayId: Buffer, requestId: Buffer, code?: number | undefined | null, reason?: string | undefined | null): Promise<void>
+  startServerless(payload: Buffer): Promise<void>
   respondCallback(responseId: string, data: any): Promise<void>
 }