Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions src/bundles/lifecycle.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,14 @@ import type {
export class BundleLifecycleManager {
private instances = new Map<string, BundleInstance>();
private placementRegistry: PlacementRegistry | null = null;
/**
* Per-(serverName, wsId) lock chain serializing `respawnBundle` against
* concurrent callers (and against itself). Each pending respawn appends
* its promise to the chain; the next caller awaits the previous one
* before starting its own stop/start. Keys are dropped from the map
* when the last queued op resolves, so this doesn't grow unbounded.
*/
private respawnLocks = new Map<string, Promise<unknown>>();
/**
* Getter for a workspace-scoped automations domain context. Set by
* Runtime after the automations platform source is constructed. Used
Expand All @@ -65,6 +73,7 @@ export class BundleLifecycleManager {
private configPath: string | undefined,
private allowInsecureRemotes = false,
private mpakHome: string = join(homedir(), ".mpak"),
private workDir: string = join(homedir(), ".nimblebrain"),
) {}

/** Set the PlacementRegistry (called by Runtime after construction). */
Expand Down Expand Up @@ -567,6 +576,115 @@ export class BundleLifecycleManager {
this.transition(instance, "stopped");
}

/**
* Stop the current bundle subprocess and re-spawn it under the same
* server name. The new subprocess inherits the platform process env
* AND re-reads workspace `user_config` from disk via the named-bundle
* path in `startBundleSource` → `mpak.prepareServer`. This is the
* canonical "user changed a credential, make the bundle use it" path,
* called from the credential-writing tool handlers
* (`set_user_config`, `configureBundle`).
*
* Concurrency: serialized per-(serverName, wsId) via `respawnLocks`.
* Concurrent respawns queue and execute in order; a respawn-in-flight
* blocks subsequent respawns for the same key until it completes
* (success or error). This prevents the half-state window where the
* old source is removed but the new one hasn't started.
*
* Tool calls dispatched during a respawn are NOT serialized by this
* lock — they target the registry, not the lifecycle map — but
* `registry.removeSource` waits for in-flight calls on the old source
* to drain before returning, and the new source is only registered
* after `startBundleSource` returns ready. So a tool call landing
* mid-respawn either completes against the old source (before
* removeSource finishes) or fails with "no source named X" (during
* the brief gap before the new source is registered). The caller
* (engine) treats both as transient and the LLM retries.
*
* Protected bundles refuse to respawn — that flag is the contract
* with platform-critical apps that must not be torn down mid-call.
*
* Scope: named bundles only today. URL bundles read OAuth credentials
* lazily via `WorkspaceOAuthProvider` and don't need respawn on
* credential change. Path bundles (dev-mode local installs) aren't
* exposed via the credential-writing tool surface.
*/
async respawnBundle(
serverName: string,
wsId: string,
registry: ToolRegistry,
): Promise<{ ok: true } | { ok: false; error: string }> {
const key = `${serverName}|${wsId}`;
const prev = this.respawnLocks.get(key);
let resolveNext!: () => void;
const next = new Promise<unknown>((resolve) => {
resolveNext = () => resolve(undefined);
});
this.respawnLocks.set(key, next);
try {
if (prev) await prev;
return await this.runRespawn(serverName, wsId, registry);
} finally {
resolveNext();
// Only drop the chain head if no one queued behind us. If the
// chain advanced, leave it intact for the next waiter.
if (this.respawnLocks.get(key) === next) {
this.respawnLocks.delete(key);
}
}
}

private async runRespawn(
serverName: string,
wsId: string,
registry: ToolRegistry,
): Promise<{ ok: true } | { ok: false; error: string }> {
const instance = this.instances.get(`${serverName}|${wsId}`);
if (!instance) {
return {
ok: false,
error: `Bundle "${serverName}" is not installed in workspace "${wsId}". Install it first via manage_app install.`,
};
}
if (instance.protected) {
return { ok: false, error: `Cannot respawn "${serverName}": bundle is protected` };
}

// Named bundles re-spawn against `{ name: bundleName }`. The named-
// bundle path in `startBundleSource` re-reads user_config from the
// workspace credential store via `resolveUserConfig` — which is the
// whole point of this method.
this.transition(instance, "starting");
try {
if (registry.hasSource(serverName)) {
await registry.removeSource(serverName);
}
await startBundleSource(
{ name: instance.bundleName },
registry,
this.eventSink,
this.configPath ? dirname(this.configPath) : undefined,
{
wsId,
workDir: this.workDir,
allowInsecureRemotes: this.allowInsecureRemotes,
},
);
this.transition(instance, "running");
return { ok: true };
} catch (err) {
// The old source is already removed and the new one failed to
// start. The instance is in `dead` state until the operator
// fixes the underlying cause (typically a bad credential value)
// and the next save retries the respawn.
this.transition(instance, "dead");
return {
ok: false,
error: err instanceof Error ? err.message : String(err),
};
}
}

// ---- State transitions -------------------------------------------------

/**
Expand Down
7 changes: 6 additions & 1 deletion src/bundles/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,12 @@ export interface BundleInstance {
briefing: BriefingBlock | null;
/** HTTP proxy declaration from _meta["ai.nimblebrain/http-proxy"]. */
httpProxy: HttpProxyConfig | null;
/** Whether the bundle is protected from uninstall. */
/**
* Whether the bundle is protected from tear-down operations: uninstall
* via `nb__manage_app` and respawn via `lifecycle.respawnBundle`. Both
* paths gate on this flag so platform-critical apps can't be torn down
* mid-call (e.g. an in-flight tool would lose its subprocess).
*/
protected: boolean;
/** Whether this is an Upjack app or plain MCP server. */
type: "upjack" | "plain";
Expand Down
2 changes: 1 addition & 1 deletion src/config/nimblebrain-config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@
},
"protected": {
"type": "boolean",
"description": "Prevents uninstall via nb__manage_app."
"description": "Prevents tear-down operations: uninstall via nb__manage_app and respawn after credential change."
},
"trustScore": { "type": ["number", "null"], "description": "MTF trust score (0-100)." },
"transport": {
Expand Down
4 changes: 3 additions & 1 deletion src/runtime/runtime.ts
Original file line number Diff line number Diff line change
Expand Up @@ -316,12 +316,14 @@ export class Runtime {

// Create placement registry and lifecycle manager
const placementRegistry = new PlacementRegistry();
const mpakHome = join(resolve(resolveWorkDir(config)), "apps");
const resolvedWorkDir = resolve(workDir);
const mpakHome = join(resolvedWorkDir, "apps");
const lifecycle = new BundleLifecycleManager(
events,
config.configPath,
config.allowInsecureRemotes,
mpakHome,
resolvedWorkDir,
);
lifecycle.setPlacementRegistry(placementRegistry);

Expand Down
50 changes: 5 additions & 45 deletions src/tools/connector-tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2306,8 +2306,11 @@ async function handleSetUserConfig(
// necessary but not sufficient; without a respawn the running
// subprocess keeps using whatever it was launched with. Mirror the
// chat agent's `configureBundle` pattern so both the chat path and
// the UI path produce identical post-write state.
const respawn = await respawnBundleAfterCredentialChange(ctx, wsId, bundleName, serverName);
// the UI path produce identical post-write state. `respawnBundle`
// owns the per-(serverName, wsId) mutex and protected-bundle gating.
const registry = ctx.runtime.getRegistryForWorkspace(wsId);
const respawnResult = await ctx.runtime.getLifecycle().respawnBundle(serverName, wsId, registry);
const respawn = respawnResult.ok ? { ok: true } : { ok: false, error: respawnResult.error };

const populated = await probeUserConfigPopulated(wsId, bundleName, workDir, schema);
return {
Expand Down Expand Up @@ -2365,49 +2368,6 @@ async function handleClearUserConfig(
};
}

/**
* Tear down + restart a stdio bundle's McpSource so a fresh subprocess
* picks up the just-written credentials from the workspace credential
* store. Called after `set_user_config` and `clear_user_config`.
*
* Why not just leave the bundle running? Mode 1 bundles read
* `user_config` once, at spawn, via `${user_config.foo}` placeholders
* resolved into env vars. The subprocess has no way to re-read after
* launch. Without this respawn the user updates a key in the UI,
* sees "✓ configured," then watches the next tool call fail with the
* old key — the bug the user hit before this fix.
*
* Best-effort by design: a respawn failure (e.g., required field still
* missing after a partial save) shouldn't roll back the credential
* write. The caller's structured response carries `{ respawn: { ok,
* error? } }` so the UI can surface the failure separately.
*/
async function respawnBundleAfterCredentialChange(
ctx: ManageConnectorsContext,
wsId: string,
bundleName: string,
serverName: string,
): Promise<{ ok: boolean; error?: string }> {
try {
const registry = ctx.runtime.getRegistryForWorkspace(wsId);
if (registry.hasSource(serverName)) {
await registry.removeSource(serverName);
}
// Pass `name` (the scoped manifest name) so startBundleSource hits
// the named-bundle path that resolves user_config from the
// workspace credential store. configDir is undefined — same as
// configureBundle's call site; named-bundle path doesn't need it.
await startBundleSource({ name: bundleName }, registry, ctx.runtime.getEventSink(), undefined, {
wsId,
workDir: ctx.runtime.getWorkDir(),
allowInsecureRemotes: ctx.runtime.getAllowInsecureRemotes(),
});
return { ok: true };
} catch (err) {
return { ok: false, error: err instanceof Error ? err.message : String(err) };
}
}

/**
* Workspace admin gate. Returns true if the identity is a workspace
* admin — explicitly via `members[].role === "admin"`, or implicitly
Expand Down
40 changes: 16 additions & 24 deletions src/tools/system-tools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ import { NoopEventSink } from "../adapters/noop-events.ts";
import type { BundleLifecycleManager } from "../bundles/lifecycle.ts";
import { getMpak } from "../bundles/mpak.ts";
import { deriveServerName } from "../bundles/paths.ts";
import { startBundleSource } from "../bundles/startup.ts";
import type { BundleManifest } from "../bundles/types.ts";
import {
installBundleInWorkspace,
Expand Down Expand Up @@ -219,7 +218,7 @@ export async function createSystemTools(
return await configureBundle(
name,
getRegistry(),
manageBundleCtx.eventSink,
lifecycle,
wsId,
manageBundleCtx.workDir,
gate,
Expand Down Expand Up @@ -712,14 +711,12 @@ function formatUptime(ms: number): string {
async function configureBundle(
name: string,
registry: ToolRegistry,
// Required — passed to the restarted McpSource so its task-augmented tool
// progress events reach SSE broadcasts (Synapse useDataSync). Without it,
// re-configuring a bundle's credentials silently breaks live updates for
// that bundle until the next full platform restart.
eventSink: EventSink,
lifecycle: BundleLifecycleManager,
// Workspace id + work directory — required because credentials are stored
// per-workspace (`{workDir}/workspaces/{wsId}/credentials/{bundle}.json`),
// not globally in `~/.mpak/config.json`. Threaded from the manage_app handler.
// The respawn itself reads eventSink and workDir off the lifecycle manager;
// workDir here is used only by `resolveUserConfig` for the credential write.
wsId: string,
workDir: string,
confirmGate?: ConfirmationGate,
Expand Down Expand Up @@ -771,26 +768,21 @@ async function configureBundle(
forcePrompt: true,
});

// Restart the bundle via the shared primitive — same construction path
// as boot-time / agent install. `startBundleSource` reads the values we
// just persisted above from the workspace credential store. If this
// function diverges from that primitive the rest of the app silently
// breaks (sink plumbing, PYTHONPATH, data-dir layout, user_config
// resolution). Delegate instead: pass `wsId`+`workDir` and let
// `startBundleSource` derive the workspace-scoped data dir itself —
// never compute it here, or it drifts from the install-time layout
// and Upjack entity state disappears across restarts.
// Re-spawn via the central lifecycle helper so this path gets the
// same per-(serverName, wsId) mutex + protected-bundle gating that
// the UI's `set_user_config` path uses. The helper reads the
// workspace credential store via `startBundleSource` → `prepareServer`,
// so the values we just persisted above land in the new subprocess.
const serverName = deriveServerName(name);
if (registry.hasSource(serverName)) {
await registry.removeSource(serverName);
const respawn = await lifecycle.respawnBundle(serverName, wsId, registry);
if (!respawn.ok) {
return {
content: textContent(`Configured ${name} but restart failed: ${respawn.error}`),
isError: true,
};
}
const result = await startBundleSource({ name }, registry, eventSink, undefined, {
wsId,
workDir,
});

const tools = await registry.availableTools();
const count = tools.filter((t) => t.name.startsWith(`${result.sourceName}__`)).length;
const count = tools.filter((t) => t.name.startsWith(`${serverName}__`)).length;
return {
content: textContent(`Configured and restarted ${name}. ${count} tools available.`),
isError: false,
Expand Down
Loading
Loading