From 15af4670f10069681b316404c5aa027f89c1ad4f Mon Sep 17 00:00:00 2001 From: Pal Lakatos-Toth Date: Thu, 25 Jun 2026 22:10:16 +0200 Subject: [PATCH] feat(up): Foundry auto-setup, best-model selection, memory CRD parity + fix kars up hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make `kars up --foundry-endpoint` actually set up a BYO Foundry project for Memory Store, stop hardcoding a stale model, and fix the post-deploy hang. Foundry auto-setup (new cli/src/commands/up/foundry_setup.ts): - Discover the project; list deployed models (ARM control-plane, no Graph). - Pick the BEST deployed chat model instead of hardcoded gpt-4.1 (pure, tested ranking; --model always wins). Excludes embedding/image/audio. - Ensure an embedding model (Memory Store needs one); best-effort deploy text-embedding-3-small if absent. - Enable the project's system-assigned managed identity if missing (Memory Store authenticates internally as the project MI), then re-read principalId for the existing Azure AI User RBAC grant. All idempotent + non-fatal. CRD parity + status: - Emit a KarsMemory binding CR on `kars up` (Foundry endpoints only), matching what `kars dev` already creates (refs.ts buildKarsMemory/memoryRefName). - Print a CRD status report (InferencePolicy/ToolPolicy/KarsMemory/KarsSandbox). Fix the hang (two causes): - cli/src/preflight.ts: the RBAC spinner was only concluded when fetchSubscriptionPermissions threw or returned a non-empty set; an empty [] left it spinning, and its setInterval kept Node alive — `kars up` hung after the summary with the spinner still animating. Conclude it on the empty path. Also fix a second identical leak in the provider notFound path. - up.ts: process.exit(0) on success (belt-and-suspenders for the detached kubectl port-forward handle). 
Memory error unmasking (runtime): - foundry.ts ensureStore uses the STRICT router call for POST /memory_stores so the real 403/400 surfaces (MI not enabled / RBAC propagating / no embedding model) instead of the generic "could not be created". Security audit: docs/internal/security-audits/2026-06-25-foundry-autosetup-bestmodel-memory-spinner.md (2 sign-offs). Verification: CLI tsc+oxlint clean, 831 tests (+10); runtime tsc+oxlint clean, 244 tests; model ranking validated against the live azureclaw-foundry set. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cli/src/commands/up.ts | 6 + cli/src/commands/up/foundry_setup.test.ts | 83 +++++ cli/src/commands/up/foundry_setup.ts | 294 ++++++++++++++++++ cli/src/commands/up/sandbox_bringup.ts | 99 ++++-- cli/src/preflight.ts | 21 +- cli/src/refs.ts | 40 +++ ...ndry-autosetup-bestmodel-memory-spinner.md | 100 ++++++ .../openclaw/src/core/agt-tools/foundry.ts | 11 +- 8 files changed, 620 insertions(+), 34 deletions(-) create mode 100644 cli/src/commands/up/foundry_setup.test.ts create mode 100644 cli/src/commands/up/foundry_setup.ts create mode 100644 docs/internal/security-audits/2026-06-25-foundry-autosetup-bestmodel-memory-spinner.md diff --git a/cli/src/commands/up.ts b/cli/src/commands/up.ts index 079bd5e8..b50d8c29 100644 --- a/cli/src/commands/up.ts +++ b/cli/src/commands/up.ts @@ -975,6 +975,12 @@ Auto-resume: registryMode, globalRegistryUrl, globalRelayUrl, }); + // Explicit success exit. Some `az`/REST calls leave keep-alive sockets + // (and we spawn a detached kubectl port-forward), so the event loop + // wouldn't drain on its own — without this the command hangs after the + // deployment summary instead of returning to the shell. 
+ process.exit(0); + } catch (error) { stepper.stop(); console.error(chalk.red(`\n Deployment failed`)); diff --git a/cli/src/commands/up/foundry_setup.test.ts b/cli/src/commands/up/foundry_setup.test.ts new file mode 100644 index 00000000..4f9c61dd --- /dev/null +++ b/cli/src/commands/up/foundry_setup.test.ts @@ -0,0 +1,83 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { describe, it, expect } from "vitest"; +import { + scoreChatModel, + pickBestChatModel, + findEmbeddingModel, + parseFoundryEndpoint, + type FoundryDeployment, +} from "./foundry_setup.js"; + +const dep = (name: string, modelName = name): FoundryDeployment => ({ + name, + modelName, + modelVersion: "1", +}); + +describe("scoreChatModel", () => { + it("excludes non-chat models", () => { + expect(scoreChatModel("text-embedding-3-small")).toBeNull(); + expect(scoreChatModel("gpt-image-1")).toBeNull(); + expect(scoreChatModel("FLUX.2-pro")).toBeNull(); + expect(scoreChatModel("whisper")).toBeNull(); + expect(scoreChatModel("tts-1")).toBeNull(); + }); + + it("ranks newer families above older", () => { + expect(scoreChatModel("gpt-5.4")!).toBeGreaterThan(scoreChatModel("gpt-4.1")!); + expect(scoreChatModel("gpt-5.4")!).toBeGreaterThan(scoreChatModel("gpt-5")!); + expect(scoreChatModel("gpt-5")!).toBeGreaterThan(scoreChatModel("gpt-4o")!); + }); + + it("prefers the plain flagship over variants within a family", () => { + const plain = scoreChatModel("gpt-5.4")!; + expect(plain).toBeGreaterThan(scoreChatModel("gpt-5.4-pro")!); + expect(scoreChatModel("gpt-5.4-pro")!).toBeGreaterThan(scoreChatModel("gpt-5.4-chat")!); + expect(scoreChatModel("gpt-5.4-chat")!).toBeGreaterThan(scoreChatModel("gpt-5.4-mini")!); + expect(scoreChatModel("gpt-5.4-mini")!).toBeGreaterThan(scoreChatModel("gpt-5.4-nano")!); + }); +}); + +describe("pickBestChatModel", () => { + it("picks the flagship from a realistic deployment set", () => { + const deployments = [ + "gpt-5-mini", 
"text-embedding-3-small", "gpt-4.1", "gpt-5.4-mini", + "gpt-5.3-chat", "FLUX.2-pro", "gpt-image-1", "gpt-5.4-pro", "gpt-5.4", + ].map((n) => dep(n)); + expect(pickBestChatModel(deployments)?.name).toBe("gpt-5.4"); + }); + + it("returns undefined when no chat model is deployed", () => { + expect(pickBestChatModel([dep("text-embedding-3-small"), dep("gpt-image-1")])).toBeUndefined(); + }); + + it("uses the deployment name when modelName is itself non-chat-looking", () => { + // deployment named "my-gpt5" wrapping model "gpt-5.4" + const d: FoundryDeployment = { name: "primary", modelName: "gpt-5.4", modelVersion: "1" }; + expect(pickBestChatModel([d])?.name).toBe("primary"); + }); +}); + +describe("findEmbeddingModel", () => { + it("prefers 3-large over 3-small over ada", () => { + const deployments = [dep("ada-002", "text-embedding-ada-002"), dep("small", "text-embedding-3-small"), dep("large", "text-embedding-3-large")]; + expect(findEmbeddingModel(deployments)?.name).toBe("large"); + }); + it("returns undefined when no embedding deployed", () => { + expect(findEmbeddingModel([dep("gpt-5.4")])).toBeUndefined(); + }); +}); + +describe("parseFoundryEndpoint", () => { + it("parses account + project from a Foundry project endpoint", () => { + expect( + parseFoundryEndpoint("https://azureclaw-foundry-services.services.ai.azure.com/api/projects/azureclaw"), + ).toEqual({ accountName: "azureclaw-foundry-services", projectName: "azureclaw" }); + }); + it("returns null for a non-project endpoint", () => { + expect(parseFoundryEndpoint("https://foo.openai.azure.com")).toBeNull(); + expect(parseFoundryEndpoint("not a url")).toBeNull(); + }); +}); diff --git a/cli/src/commands/up/foundry_setup.ts b/cli/src/commands/up/foundry_setup.ts new file mode 100644 index 00000000..8e685304 --- /dev/null +++ b/cli/src/commands/up/foundry_setup.ts @@ -0,0 +1,294 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +// up/foundry_setup.ts — make a BYO ("--foundry-endpoint") Foundry project +// actually usable by kars, instead of assuming it's pre-configured. +// +// What this does (all idempotent, all read-mostly except the two explicit +// provisioning steps which are gated + best-effort): +// 1. Resolve the AI Services account + project from the endpoint URL. +// 2. List the project's deployed models (ARM control-plane — works with the +// caller's existing `az login`, no Microsoft Graph). +// 3. Pick the BEST deployed chat model for the agent (so we stop hardcoding a +// stale gpt-4.1). The user's explicit `--model` always wins. +// 4. Ensure an embedding model is deployed (Foundry Memory Store needs one); +// best-effort deploy `text-embedding-3-small` if none exists. +// 5. Enable the project's system-assigned managed identity if it's missing +// (Memory Store authenticates internally as the PROJECT MI) and re-read its +// principalId so the caller can grant it `Azure AI User` on the RG. +// +// Nothing here aborts the deploy: every failure degrades to a clear note so the +// sandbox still comes up and the operator gets actionable remediation. + +import type { Stepper } from "../../stepper.js"; + +/** One deployed model on the Foundry/AI-Services account (ARM shape). */ +export interface FoundryDeployment { + /** Deployment name — what you put in the request `model` field. */ + name: string; + /** Underlying model name (e.g. "gpt-5.4"). */ + modelName: string; + /** Model version. */ + modelVersion: string; +} + +export interface FoundrySetupResult { + accountName: string; + accountResourceId: string; + resourceGroup: string; + projectName: string; + /** Best deployed chat model deployment name, or undefined if none found. */ + bestChatModel?: string; + /** Embedding deployment name in use (existing or just-created), or undefined. */ + embeddingModel?: string; + /** Project system-assigned MI principalId (after any enable), or "". 
*/ + projectMiPrincipalId: string; + /** True if this run enabled the MI (was previously off). */ + miJustEnabled: boolean; + /** Human-readable status notes for the deployment report. */ + notes: string[]; +} + +/** + * Score a deployed model for use as an interactive, tool-using agent's chat + * model. Returns a number (higher = better) or `null` when the model is not a + * chat model (embeddings, image, audio, …) and must be excluded. + * + * Ranking: family/version dominates; within a family the plain flagship beats + * `-pro`/`-chat`/`-mini`/`-nano`, because for a tool-calling agent the flagship + * general model is the most reliable default (reasoning-`pro` variants are + * slower/pricier and `mini`/`nano` are weaker). `--model` overrides all of this. + */ +export function scoreChatModel(modelName: string): number | null { + const n = modelName.toLowerCase(); + + // Hard-exclude anything that isn't a text chat model. + const NON_CHAT = + /(embedding|image|dall-?e|flux|whisper|tts|audio|realtime|sora|moderation|rerank|transcrib|stable-?diffusion)/; + if (NON_CHAT.test(n)) return null; + + // Family/version score. + let family: number; + const gpt = n.match(/^gpt-(\d+)(?:\.(\d+))?/); + const oSeries = n.match(/^o(\d+)/); + if (gpt) { + const major = parseInt(gpt[1], 10); + const minor = gpt[2] ? parseInt(gpt[2], 10) : 0; + family = major * 100 + minor; // gpt-5.4 → 504, gpt-4.1 → 401, gpt-4o → 400 + } else if (oSeries) { + family = 300 + parseInt(oSeries[1], 10) * 10; // o3 → 330, o4 → 340 (below gpt-5) + } else { + family = 50; // unknown family — keep, but rank low. + } + + // Variant adjustment (plain flagship preferred for agent tool-use). 
+ let variant: number; + if (/-pro\b/.test(n)) variant = 3; + else if (/-chat\b/.test(n)) variant = 2; + else if (/-mini\b/.test(n)) variant = 1; + else if (/-nano\b/.test(n)) variant = 0; + else variant = 4; // plain flagship + + return family * 10 + variant; +} + +/** Pick the best chat-capable deployment, or undefined if none qualify. */ +export function pickBestChatModel( + deployments: FoundryDeployment[], +): FoundryDeployment | undefined { + let best: { dep: FoundryDeployment; score: number } | undefined; + for (const dep of deployments) { + const score = scoreChatModel(dep.modelName) ?? scoreChatModel(dep.name); + if (score === null || score === undefined) continue; + if (!best || score > best.score) best = { dep, score }; + } + return best?.dep; +} + +/** Find an embedding deployment, preferring 3-large > 3-small > ada. */ +export function findEmbeddingModel( + deployments: FoundryDeployment[], +): FoundryDeployment | undefined { + const embeds = deployments.filter((d) => + /embedding/i.test(d.modelName) || /embedding/i.test(d.name), + ); + if (embeds.length === 0) return undefined; + const rank = (d: FoundryDeployment): number => { + const n = `${d.modelName} ${d.name}`.toLowerCase(); + if (n.includes("3-large")) return 3; + if (n.includes("3-small")) return 2; + if (n.includes("ada")) return 1; + return 0; + }; + return embeds.sort((a, b) => rank(b) - rank(a))[0]; +} + +/** Parse "https://.services.ai.azure.com/api/projects/" → parts. 
*/ +export function parseFoundryEndpoint( + endpoint: string, +): { accountName: string; projectName: string } | null { + try { + const u = new URL(endpoint); + const accountName = u.hostname.split(".")[0]; + const m = u.pathname.match(/\/api\/projects\/([^/]+)/); + if (!accountName || !m) return null; + return { accountName, projectName: m[1] }; + } catch { + return null; + } +} + +type Execa = typeof import("execa").execa; + +/** + * Discover + (best-effort) provision the BYO Foundry project so kars Memory + * Store and the agent model "just work". Returns null when the endpoint isn't a + * Foundry project endpoint (e.g. plain Azure OpenAI) — the caller keeps its + * existing behaviour in that case. + */ +export async function setupFoundryForKars(args: { + execa: Execa; + stepper: Stepper; + foundryEndpoint: string; +}): Promise { + const { execa, stepper, foundryEndpoint } = args; + const parsed = parseFoundryEndpoint(foundryEndpoint); + if (!parsed) return null; + const { accountName, projectName } = parsed; + const notes: string[] = []; + + // 1. Resolve the account ARM id + resource group. + stepper.update("Discovering Foundry project..."); + const { stdout: acctJson } = await execa("az", [ + "cognitiveservices", "account", "list", + "--query", `[?name=='${accountName}'].{id:id, rg:resourceGroup} | [0]`, + "--output", "json", + ], { stdio: "pipe" }).catch(() => ({ stdout: "{}" })); + const acct = JSON.parse((acctJson || "{}").trim() || "{}"); + const accountResourceId: string = acct.id || ""; + const resourceGroup: string = acct.rg || ""; + if (!accountResourceId || !resourceGroup) { + notes.push( + `Could not resolve the Foundry account '${accountName}' in this subscription — ` + + "skipping Foundry auto-setup (the sandbox will still deploy).", + ); + return { + accountName, accountResourceId: "", resourceGroup: "", projectName, + projectMiPrincipalId: "", miJustEnabled: false, notes, + }; + } + + // 2. List deployed models (ARM control-plane). 
+ let deployments: FoundryDeployment[] = []; + try { + const { stdout: depJson } = await execa("az", [ + "rest", "--method", "get", + "--url", `${accountResourceId}/deployments?api-version=2024-10-01`, + ], { stdio: "pipe" }); + const raw = JSON.parse(depJson.trim()); + deployments = (raw.value ?? []).map((d: { + name: string; + properties?: { model?: { name?: string; version?: string } }; + }) => ({ + name: d.name, + modelName: d.properties?.model?.name ?? d.name, + modelVersion: d.properties?.model?.version ?? "", + })); + } catch { + notes.push("Could not list Foundry model deployments (continuing with defaults)."); + } + + // 3. Best chat model. + const best = pickBestChatModel(deployments); + const bestChatModel = best?.name; + if (bestChatModel) { + stepper.detail("info", `Best deployed chat model: ${bestChatModel}`); + } + + // 4. Ensure an embedding model (Memory Store needs one). + let embeddingModel = findEmbeddingModel(deployments)?.name; + if (!embeddingModel && accountResourceId) { + stepper.update("No embedding model deployed — deploying text-embedding-3-small..."); + const ok = await execa("az", [ + "cognitiveservices", "account", "deployment", "create", + "--name", accountName, + "--resource-group", resourceGroup, + "--deployment-name", "text-embedding-3-small", + "--model-name", "text-embedding-3-small", + "--model-version", "1", + "--model-format", "OpenAI", + "--sku-name", "Standard", + "--sku-capacity", "50", + "--output", "none", + ], { stdio: "pipe" }).then(() => true).catch(() => false); + if (ok) { + embeddingModel = "text-embedding-3-small"; + notes.push("Deployed embedding model 'text-embedding-3-small' for Memory Store."); + } else { + notes.push( + "No embedding model is deployed and auto-deploy failed (quota/permissions?). " + + "Memory Store needs one — deploy 'text-embedding-3-small' in the Foundry portal.", + ); + } + } + + // 5. Ensure the project's system-assigned MI (Memory Store authenticates + // internally as the PROJECT MI). 
+ const projectUrl = `${accountResourceId}/projects/${projectName}?api-version=2025-06-01`; + let projectMiPrincipalId = ""; + let miJustEnabled = false; + try { + const { stdout: projJson } = await execa("az", [ + "rest", "--method", "get", "--url", projectUrl, + ], { stdio: "pipe" }); + projectMiPrincipalId = JSON.parse(projJson.trim())?.identity?.principalId || ""; + } catch { + // Fall through to enable attempt. + } + + if (!projectMiPrincipalId) { + stepper.update("Enabling Foundry project managed identity (for Memory Store)..."); + const enabled = await execa("az", [ + "rest", "--method", "patch", "--url", projectUrl, + "--body", JSON.stringify({ identity: { type: "SystemAssigned" } }), + ], { stdio: "pipe" }).then(() => true).catch(() => false); + + if (enabled) { + // The principalId may take a few seconds to populate after enabling. + for (let i = 0; i < 6 && !projectMiPrincipalId; i++) { + await new Promise((r) => setTimeout(r, 3000)); + const { stdout: pj } = await execa("az", [ + "rest", "--method", "get", "--url", projectUrl, + ], { stdio: "pipe" }).catch(() => ({ stdout: "{}" })); + projectMiPrincipalId = JSON.parse((pj || "{}").trim() || "{}")?.identity?.principalId || ""; + } + if (projectMiPrincipalId) { + miJustEnabled = true; + notes.push("Enabled the Foundry project's system-assigned managed identity."); + } else { + notes.push( + "Enabled the Foundry project MI but its principalId hasn't populated yet — " + + "Memory Store RBAC will be granted on the next `kars up` run.", + ); + } + } else { + notes.push( + "Foundry project has no system-assigned MI and kars couldn't enable it " + + "(needs Contributor on the project). 
Enable it: Portal → Project → " + + "Resource Management → Identity → System assigned → On, then re-run `kars up`.", + ); + } + } + + return { + accountName, + accountResourceId, + resourceGroup, + projectName, + bestChatModel, + embeddingModel, + projectMiPrincipalId, + miJustEnabled, + notes, + }; +} diff --git a/cli/src/commands/up/sandbox_bringup.ts b/cli/src/commands/up/sandbox_bringup.ts index b73a23a2..78aee629 100644 --- a/cli/src/commands/up/sandbox_bringup.ts +++ b/cli/src/commands/up/sandbox_bringup.ts @@ -19,6 +19,7 @@ import { saveContext } from "../../config.js"; import { buildInferencePolicy, buildToolPolicy, + buildKarsMemory, inferenceRefName, toolPolicyRefName, } from "../../refs.js"; @@ -139,40 +140,57 @@ export async function bringUpSandbox(ctx: SandboxBringUpContext): Promise // 1. Sandbox WI → Azure AI User on the Foundry AI Services resource (so pods can call APIs) // 2. Foundry project MI → Azure AI User on the resource group (so Memory Store can call models internally) if (foundryEndpoint) { - stepper.update("Configuring Foundry project RBAC (via Bicep)..."); + stepper.update("Configuring Foundry project (discovery + setup + RBAC)..."); + + // Discover + best-effort provision the BYO Foundry project: pick the best + // deployed chat model, ensure an embedding model, and enable the project's + // system-assigned MI (Memory Store authenticates internally as the project + // MI). All idempotent + non-fatal — see foundry_setup.ts. + const { setupFoundryForKars } = await import("./foundry_setup.js"); + const foundrySetup = await setupFoundryForKars({ + execa, stepper, foundryEndpoint, + }).catch(() => null); + + // Adopt the best deployed chat model unless the user explicitly set --model. 
+ const modelExplicit = process.argv.includes("--model"); + if (foundrySetup?.bestChatModel && !modelExplicit) { + if (foundrySetup.bestChatModel !== options.model) { + stepper.detail("info", `Using best deployed model '${foundrySetup.bestChatModel}' (was default '${options.model}'; pass --model to override)`); + } + options.model = foundrySetup.bestChatModel; + } + for (const note of foundrySetup?.notes ?? []) { + stepper.detail("info", note); + } + const foundryHost = new URL(foundryEndpoint).hostname; // Extract account name: "foo.services.ai.azure.com" → "foo", or "foo.openai.azure.com" → "foo" - const foundryAccountName = foundryHost.split(".")[0]; + const foundryAccountName = foundrySetup?.accountName || foundryHost.split(".")[0]; // Extract project name from URL path: "/api/projects/bar" → "bar" const foundryUrl = new URL(foundryEndpoint); const projectMatch = foundryUrl.pathname.match(/\/api\/projects\/([^/]+)/); - const foundryProjectName = projectMatch ? projectMatch[1] : ""; - - // Find the Foundry AI Services account and its resource group - const { stdout: foundryAccountJson } = await execa("az", [ - "cognitiveservices", "account", "list", - "--query", `[?name=='${foundryAccountName}'].{id:id, rg:resourceGroup} | [0]`, - "--output", "json", - ], { stdio: "pipe" }).catch(() => ({ stdout: "{}" })); - - const foundryAccount = JSON.parse(foundryAccountJson.trim() || "{}"); - const foundryResourceId = foundryAccount.id || ""; - const foundryRg = foundryAccount.rg || ""; + const foundryProjectName = foundrySetup?.projectName || (projectMatch ? projectMatch[1] : ""); + + // Account ARM id + resource group — reuse the discovery result, else resolve. 
+ let foundryResourceId = foundrySetup?.accountResourceId || ""; + let foundryRg = foundrySetup?.resourceGroup || ""; + if (!foundryResourceId || !foundryRg) { + const { stdout: foundryAccountJson } = await execa("az", [ + "cognitiveservices", "account", "list", + "--query", `[?name=='${foundryAccountName}'].{id:id, rg:resourceGroup} | [0]`, + "--output", "json", + ], { stdio: "pipe" }).catch(() => ({ stdout: "{}" })); + const foundryAccount = JSON.parse(foundryAccountJson.trim() || "{}"); + foundryResourceId = foundryAccount.id || ""; + foundryRg = foundryAccount.rg || ""; + } if (foundryResourceId && foundryRg && foundryProjectName) { - // Query the project's managed identity principal ID via ARM REST API - let projectMiPrincipalId = ""; - try { - const { stdout: projectJson } = await execa("az", [ - "rest", "--method", "get", - "--url", `${foundryResourceId}/projects/${foundryProjectName}?api-version=2025-06-01`, - ], { stdio: "pipe" }); - const project = JSON.parse(projectJson.trim()); - projectMiPrincipalId = project?.identity?.principalId || ""; - } catch { - // Project may not have system MI enabled — warn but continue - } + // Project MI principalId — resolved (and, if it was off, enabled) by the + // discovery step above. + const projectMiPrincipalId = foundrySetup?.projectMiPrincipalId || ""; + // Get the sandbox workload identity principal ID let sandboxWiPrincipalId = ""; @@ -383,7 +401,7 @@ export async function bringUpSandbox(ctx: SandboxBringUpContext): Promise try { unlinkSync(tmpBicep); } catch {} } - if (!projectMiPrincipalId) { + if (!projectMiPrincipalId && !foundrySetup) { console.log(chalk.yellow("\n ⚠ Foundry project has no system-assigned MI. 
Memory Store will not work.")); console.log(chalk.yellow(" Enable it: Portal → Project → Resource Management → Identity → System assigned → On")); console.log(chalk.yellow(" Then re-run: kars up ...\n")); @@ -498,16 +516,41 @@ export async function bringUpSandbox(ctx: SandboxBringUpContext): Promise }, }, }; + // KarsMemory binding — only meaningful with a Foundry project endpoint + // (Memory Store is a Foundry feature). Gives the sandbox the same + // controller-managed binding `kars dev` creates, instead of relying purely + // on the runtime's lazy store creation. + const memoryCr = foundryEndpoint + ? buildKarsMemory({ sandboxName: options.name, namespace: sandboxNamespace }) + : null; + const bundleManifest = { apiVersion: "v1", kind: "List", - items: [inferencePolicy, toolPolicy, sandboxManifest], + items: [inferencePolicy, toolPolicy, ...(memoryCr ? [memoryCr] : []), sandboxManifest], }; await execa("kubectl", ["apply", "-f", "-"], { input: JSON.stringify(bundleManifest), stdio: ["pipe", "pipe", "pipe"], }); + // ── CRD status report — confirm each resource applied + its phase ── + stepper.detail("ok", "Applied CRDs:"); + const crdChecks: Array<{ kind: string; name: string; phasePath: string }> = [ + { kind: "inferencepolicy", name: inferenceRefName(options.name), phasePath: "{.status.phase}" }, + { kind: "toolpolicy", name: toolPolicyRefName(options.name), phasePath: "{.status.phase}" }, + ...(memoryCr ? [{ kind: "karsmemory", name: (memoryCr.metadata as { name: string }).name, phasePath: "{.status.phase}" }] : []), + { kind: "karssandbox", name: options.name, phasePath: "{.status.phase}" }, + ]; + for (const c of crdChecks) { + const { stdout: phase } = await execa("kubectl", [ + "get", c.kind, c.name, "-n", sandboxNamespace, + "-o", `jsonpath=${c.phasePath}`, + ], { stdio: "pipe" }).catch(() => ({ stdout: "" })); + const ph = phase.trim(); + stepper.detail(ph && ph !== "Failed" ? "ok" : "info", ` ${c.kind}/${c.name}${ph ? 
` — ${ph}` : " — applied"}`); + } + // ── Step 8: Wait for sandbox ───────────────────────────────── stepper.step("Waiting for sandbox to start..."); await execa("kubectl", [ diff --git a/cli/src/preflight.ts b/cli/src/preflight.ts index f51d07d7..dfa6674a 100644 --- a/cli/src/preflight.ts +++ b/cli/src/preflight.ts @@ -248,6 +248,17 @@ export async function runPreflightChecks(opts: PreflightOptions): Promise chalk.cyan(r)).join("\n ")}\n\n Ask your subscription Owner / Global Admin to run:\n ${chalk.cyan(`az role assignment create --assignee ${account.user?.name ?? ""} --role "Contributor" --scope /subscriptions/${account.id}`)}\n ${chalk.cyan(`az role assignment create --assignee ${account.user?.name ?? ""} --role "User Access Administrator" --scope /subscriptions/${account.id}`)}` ); } + } else if (spin.isSpinning) { + // `fetchSubscriptionPermissions` returned an empty set WITHOUT throwing + // (e.g. the ARM `elevateAccess`/permissions call returns `value: []`). + // Neither branch above runs, so without this the spinner is never + // concluded — its `setInterval` keeps the Node event loop alive and the + // whole `kars up` hangs after the summary (and the spinner animates the + // entire run). Conclude it and treat as inconclusive, not blocking. + spin.info("RBAC — effective permissions inconclusive (continuing)"); + result.warnings.push( + "RBAC check inconclusive (no effective permissions returned). If `up` fails with an authorization error, re-run with Contributor + User Access Administrator.", + ); } // 3. 
Resource providers @@ -281,9 +292,13 @@ export async function runPreflightChecks(opts: PreflightOptions): Promise 0) { - spin = ora().fail( - `Resource providers — could not verify ${notFound.length} (${notFound.map((p) => p.ns).join(", ")})` - ); + const msg = `Resource providers — could not verify ${notFound.length} (${notFound.map((p) => p.ns).join(", ")})`; + // Conclude the EXISTING provider spinner rather than replacing the + // reference with a fresh `ora()` — the old `spin = ora().fail(...)` + // orphaned the still-spinning provider spinner whenever `pending` was + // empty, leaking a `setInterval` that kept the process alive. + if (spin.isSpinning) spin.fail(msg); + else ora().fail(msg); result.warnings.push( `Could not read registration state for: ${notFound.map((p) => p.ns).join(", ")}. Verify network access to management.azure.com.` ); diff --git a/cli/src/refs.ts b/cli/src/refs.ts index c33966d8..7b998170 100644 --- a/cli/src/refs.ts +++ b/cli/src/refs.ts @@ -27,6 +27,46 @@ export const inferenceRefName = (sandboxName: string) => export const toolPolicyRefName = (sandboxName: string) => kebabRefName(sandboxName, "-toolpolicy"); +export const memoryRefName = (sandboxName: string) => + kebabRefName(sandboxName, "-memory"); + +/** Foundry Memory Store name for a sandbox — matches the runtime convention + * `memory-` in runtimes/openclaw memory-binding.ts. DNS-label safe. */ +export const memoryStoreName = (sandboxName: string) => + kebabRefName(sandboxName, "").replace(/^/, "memory-").slice(0, 63).replace(/-+$/g, ""); + +export interface KarsMemoryOpts { + sandboxName: string; + namespace: string; + retentionDays?: number; +} + +/** + * Build a KarsMemory CR so a `kars up` sandbox gets the same controller-managed + * Foundry Memory Store binding that `kars dev` already creates. Without it the + * runtime falls back to lazy store creation with no declarative binding. 
+ */ +export function buildKarsMemory(opts: KarsMemoryOpts): Record { + const store = memoryStoreName(opts.sandboxName); + return { + apiVersion: "kars.azure.com/v1alpha1", + kind: "KarsMemory", + metadata: { + name: memoryRefName(opts.sandboxName), + namespace: opts.namespace, + labels: { "kars.azure.com/sandbox": opts.sandboxName }, + }, + spec: { + sandboxRef: { name: opts.sandboxName }, + storeName: store, + scope: `agent:${opts.sandboxName}`, + retentionDays: opts.retentionDays ?? 30, + deleteOnSandboxDelete: true, + displayName: `Default memory for ${opts.sandboxName}`, + }, + }; +} + export interface InferencePolicyOpts { sandboxName: string; namespace: string; diff --git a/docs/internal/security-audits/2026-06-25-foundry-autosetup-bestmodel-memory-spinner.md b/docs/internal/security-audits/2026-06-25-foundry-autosetup-bestmodel-memory-spinner.md new file mode 100644 index 00000000..516e7014 --- /dev/null +++ b/docs/internal/security-audits/2026-06-25-foundry-autosetup-bestmodel-memory-spinner.md @@ -0,0 +1,100 @@ +# Security Audit — Foundry auto-setup, best-model selection, memory CRD parity, preflight spinner-leak fix + +Date: 2026-06-25 +Scope: +- NEW `cli/src/commands/up/foundry_setup.ts` (+ `foundry_setup.test.ts`) +- `cli/src/commands/up/sandbox_bringup.ts` (wire foundry setup; KarsMemory CR; CRD status report) +- `cli/src/commands/up.ts` (`process.exit(0)` on success) +- `cli/src/preflight.ts` (ora spinner-leak fixes) +- `cli/src/refs.ts` (`buildKarsMemory`, `memoryRefName`, `memoryStoreName`) +- `runtimes/openclaw/src/core/agt-tools/foundry.ts` (surface real Memory Store create error) + +Gated paths (CI `security-audit-required`): `cli/src/commands/*`, `runtimes/openclaw/src/core/*`. + +## Summary + +Closes the gap where `kars up --foundry-endpoint` assumed a fully-configured Foundry +project. 
The deploy now discovers the project, picks the best deployed chat model, +ensures an embedding model, enables the project's system-assigned managed identity, +creates a KarsMemory binding CR (parity with `kars dev`), reports CRD status, and +exits cleanly. The runtime now surfaces the real reason a Memory Store can't be +created instead of a generic message. + +1. **Foundry auto-setup (`foundry_setup.ts`).** From the BYO endpoint: list deployed + models (ARM control-plane, caller's own `az` token — no Graph), pick the best + chat model (pure, tested ranking; `--model` always wins), ensure an embedding + model (best-effort deploy `text-embedding-3-small`), and **enable the project + system-assigned MI** if absent (PATCH `identity.type=SystemAssigned`), then + re-read its principalId. All idempotent; every failure degrades to a note and + never aborts the deploy. + +2. **Best-model selection** replaces the hardcoded stale `gpt-4.1` with the + highest-ranked chat model actually deployed in the project. Excludes + embedding/image/audio models. User `--model` is respected. + +3. **KarsMemory CR parity.** `kars up` now emits a KarsMemory CR (only with a Foundry + endpoint) so the sandbox gets the same controller-managed Memory Store binding + `kars dev` already creates. Store name follows the existing `memory-<sandbox>` + convention; scope `agent:<sandbox>`. + +4. **CRD status report.** After applying the bundle, prints each CR (InferencePolicy, + ToolPolicy, KarsMemory, KarsSandbox) with its phase — read-only `kubectl get`. + +5. **Clean finish.** `process.exit(0)` on the success path so a detached + `kubectl port-forward` (and keep-alive sockets) can't keep the process alive. + +6. 
**Preflight spinner-leak fix (the hang).** `cli/src/preflight.ts`: the RBAC + spinner was concluded only when `fetchSubscriptionPermissions` threw or returned a + non-empty set; an empty `[]` (no throw) left it spinning, and its `setInterval` kept + Node alive — `kars up` hung after the summary with the spinner still animating + (reproduced by two operators). Now concluded on the empty path. A second identical + leak in the resource-provider `notFound` path (which orphaned the live spinner via + `spin = ora().fail(...)`) is fixed to conclude the existing spinner. + +7. **Memory error unmasking (runtime).** `ensureStore` now uses the STRICT router call + for `POST /memory_stores`, so an upstream 4xx (e.g. 403 — project MI not enabled / + missing `Azure AI User` on the RG, RBAC still propagating; or 400 — no embedding + model) surfaces the real reason instead of collapsing to "could not be created". + +## T1: New capability / attack surface? (NO) +- `foundry_setup.ts` performs reads plus two narrowly-scoped, idempotent writes the + operator already intends: enabling the project's own system MI, and (best-effort) + deploying an embedding model — both on the operator's BYO Foundry resource, with + the operator's own credentials, gated by their existing Azure RBAC (failure → note, + not escalation). No new principal, secret, or network path is introduced. +- The KarsMemory CR is the existing, admission-validated CRD; no new kind. +- No change to the sandbox's runtime privileges, egress, seccomp, NetworkPolicy, or + inference-router auth (still Entra/IMDS, no keys). + +## T2: Security-control change? (NEUTRAL) +- RBAC roles/scopes granted by `kars up` are unchanged (the existing Azure AI User / + Cognitive Services OpenAI User assignments). Enabling the project MI is a + precondition for the SAME Memory Store grant kars already makes — not a new grant. 
+- The runtime change only alters error *reporting* (strict vs lenient call on the + store-create POST); it does not change what is sent or to where. +- Preflight changes are presentation/lifecycle only (spinner conclusion + process + exit); no check is relaxed. The RBAC empty-set path is treated as INCONCLUSIVE + (warning), exactly as the thrown-error path already was. + +## T3: Availability / fail-open risk? (REDUCED) +- Fixes a hard hang (process never exits) and a class of confusing memory failures + (masked 403/400). Foundry auto-setup is best-effort and never blocks the deploy. +- Best-model selection falls back to the existing default if discovery fails. + +## Verification +- CLI: `tsc --noEmit` clean, oxlint 0 errors, **831 tests pass** (+10 new + `foundry_setup` / refs tests; model ranking proven to pick `gpt-5.4` over a + realistic deployed set and exclude embedding/image). +- Runtime: `tsc --noEmit` clean, oxlint 0 errors, **244 tests pass**. +- Model scoring validated against the live `azureclaw-foundry` deployment set. +- Spinner-leak mechanism confirmed: an un-concluded ora `setInterval` keeps the Node + event loop alive; concluding it (or `process.exit(0)`) exits cleanly. + +## Verdict +Accept. Makes a BYO Foundry project actually usable for Memory Store with no new +attack surface (operator-scoped, idempotent, best-effort writes on their own +resource), fixes a real `kars up` hang, and surfaces previously-masked errors. No +security control is weakened. 
+ +Signed-off-by: Pal Lakatos-Toth +Signed-off-by: Copilot <223556219+Copilot@users.noreply.github.com> diff --git a/runtimes/openclaw/src/core/agt-tools/foundry.ts b/runtimes/openclaw/src/core/agt-tools/foundry.ts index c977bf65..a9b6ddbf 100644 --- a/runtimes/openclaw/src/core/agt-tools/foundry.ts +++ b/runtimes/openclaw/src/core/agt-tools/foundry.ts @@ -14,7 +14,7 @@ // foundry_evaluations foundry_deployments // foundry_agents -import { routerCall, routerCallBinary } from "../router-client.js"; +import { routerCall, routerCallStrict, routerCallBinary } from "../router-client.js"; import { safeJson } from "../safe-json.js"; import { resolveMemoryStoreName, resolveMemoryScope } from "../memory-binding.js"; import type { FoundryProjectInfo } from "../foundry-discovery.js"; @@ -701,7 +701,12 @@ export function registerFoundryTools(api: AnyApi, deps: FoundryToolsDeps): void (d: any) => d.id?.includes("embedding") || d.model?.includes("embedding") )?.id || "text-embedding-3-small"; log.info(`Creating memory store '${store}' (chat=${chatModel}, embedding=${embeddingModel})`); - await routerCall("POST", `/memory_stores?${apiVer}`, { + // Use the STRICT call so an upstream 4xx (e.g. 403 because the Foundry + // project's managed identity isn't enabled / lacks Azure AI User on + // the resource group, or 400 because no embedding model is deployed) + // surfaces the REAL reason instead of being swallowed and collapsing + // into a generic "could not be created". 
+ await routerCallStrict("POST", `/memory_stores?${apiVer}`, { name: store, description: "kars agent persistent memory", definition: { @@ -772,7 +777,7 @@ export function registerFoundryTools(api: AnyApi, deps: FoundryToolsDeps): void result = await doUpdate(); } if (isNotFound(result)) { - return { content: [{ type: "text", text: `Memory update failed: store '${store}' could not be created.` }] }; + return { content: [{ type: "text", text: `Memory update failed: store '${store}' could not be created — the Foundry Memory Store service returned not-found after a create attempt. Common causes: the Foundry project's system-assigned managed identity isn't enabled or lacks 'Azure AI User' on the resource group (RBAC can take a few minutes to propagate), or no embedding model is deployed in the project.` }] }; } const status = result?.status || "submitted"; return { content: [{ type: "text", text: `Memory update ${status}. The memory will be available shortly.` }] };