From 31f94937184137b06cced5f603f4eba6bb956575 Mon Sep 17 00:00:00 2001 From: 61cygni <> Date: Fri, 22 May 2026 21:55:59 -0700 Subject: [PATCH 1/8] Add TypeScript tool registry Co-authored-by: Cursor --- docs/tools.md | 112 ++++++ examples/typescript/basic-harness.ts | 8 +- examples/typescript/tools/uppercase.ts | 72 ++++ typescript/harness/built-in-tools.ts | 82 +++++ typescript/harness/index.test.ts | 484 +++++++++++++++++++++++++ typescript/harness/index.ts | 31 +- typescript/harness/tool-manifest.ts | 101 ++++++ typescript/harness/tools.ts | 212 +++++++++++ typescript/model-runtime/responses.ts | 9 +- 9 files changed, 1082 insertions(+), 29 deletions(-) create mode 100644 docs/tools.md create mode 100644 examples/typescript/tools/uppercase.ts create mode 100644 typescript/harness/built-in-tools.ts create mode 100644 typescript/harness/index.test.ts create mode 100644 typescript/harness/tool-manifest.ts create mode 100644 typescript/harness/tools.ts diff --git a/docs/tools.md b/docs/tools.md new file mode 100644 index 0000000..00114a1 --- /dev/null +++ b/docs/tools.md @@ -0,0 +1,112 @@ +# Tools + +The TypeScript harness has broad support for tool use. There are basic tools +provided by the harness, as well as agent generated tools, and support for tools +developed by the user or a third party. Most generally, tools are model-callable +functions. They let a model ask the harness to do something outside the model +call, such as run a shell command, call a service, inspect local state, or +execute project-specific code. + +## Tool Types + +### Built-In Tools + +Built-in tools are maintained by the `exo` project and shipped with the harness +runtime. + +Currently the only built-in tool is `shell`. The shell tool runs commands in the +conversation sandbox, using the conversation’s configured `shellProgram`. 
The +TypeScript harness exposes the model-facing tool, but execution is delegated to +the Rust host runtime so sandbox lifecycle and process execution remain +host-managed. + +### Library Tools + +Library tools are not written by the agent itself and are not part of the core +`exo` release. They may be written by the user, a team, or an external +maintainer. They are loaded explicitly by the harness. + +A library tool is a TypeScript module with a default export satisfying `Tool`. + +### Agent Tools + +Agent tools are created by the agent itself. They use the same default-export +`Tool` module contract as library tools, but they are registered with source +`"agent"` instead of `"library"`. + +Agent tools should be treated as less trusted than built-in or library tools. +Load them from an explicit manifest and keep the scope narrow. + +The loader: + +- imports the module's default export +- verifies it looks like a `Tool` +- validates `initialization` against `initializationParameters` +- initializes the tool with source `"agent"` +- registers the resulting `ToolInstance` + +No directory scanning is done by default. The manifest is explicit so users and +harness authors can see which agent-created modules are being loaded. + +## Events + +Tool use is stored in the conversation event log. + +The model runtime records tool requests as `tool_requested` events. The registry +returns `tool_result` events after execution. The harness appends those events +to the current turn. + +This means the durable conversation history records: + +1. The model requested a tool. +2. The harness executed or rejected it. +3. The result was returned to the next model round. + +Tracing can separately record richer information, such as duration, errors, or +tool source, without changing the canonical event shape. + +## Configuration And Secrets + +Tools should use existing exoharness configuration primitives: + +- Put credentials in `Secret`. 
+- Refer to secrets by id in initialization parameters. +- Keep non-secret setup values in harness code, config, manifests, or artifacts. +- Do not put raw secrets in tool definitions, model-visible prompts, or + `tool_result` events. + +For example, an IRC tool might take `passwordSecretId` as an initialization +parameter. The tool handler can resolve the secret at execution time through the +exoharness API. + +## Safety Considerations + +Different tool sources have different trust levels: + +- Built-in tools are first-party and reviewed with `exo`. +- Library tools are trusted by the user or harness author who chose to load + them. +- Agent tools are generated by the agent and should have the narrowest scope. + +Recommended defaults: + +- Load tools explicitly, not by scanning directories. +- Validate initialization parameters before exposing a tool. +- Require explicit networking enablement for tools that call external services. +- Require confirmation for tools with external side effects. +- Avoid persisting agent tools beyond the conversation or workspace unless a + user reviews and promotes them. +- Keep large logs in artifacts, not event payloads. + +## Current Status + +The generic registry, built-in shell registration, library tool loading, and +agent tool manifest loading are implemented in the TypeScript harness API. + +The basic TypeScript harness currently opts into only the built-in shell tool. +Library and agent tools can be registered by harnesses using the manifest +helpers, but the basic harness does not load manifests automatically yet. + +There is an example library tool at `examples/typescript/tools/uppercase.ts`. +It exists to test and demonstrate the registry contract; no example harness +exposes it to a model by default. 
diff --git a/examples/typescript/basic-harness.ts b/examples/typescript/basic-harness.ts index 595e285..b1ff657 100644 --- a/examples/typescript/basic-harness.ts +++ b/examples/typescript/basic-harness.ts @@ -1,7 +1,8 @@ import { - buildShellToolDefinitions, + createToolRegistry, defineHarness, materializePromptMessages, + registerBuiltInTools, turnMetadata, type TurnContext, } from "@exo/harness"; @@ -37,6 +38,8 @@ async function runBasicTurnLoop( ): Promise { const { conversation, turn } = context.exoharness.current; const maxToolRoundTrips = context.agentConfig.maxToolRoundTrips; + const tools = createToolRegistry(context); + registerBuiltInTools(tools, context, ["shell"]); let latestEventId: string | null = null; for (let round = 0; ; round += 1) { @@ -55,7 +58,7 @@ async function runBasicTurnLoop( const request: NativeResponsesRequest = { model, messages, - tools: buildShellToolDefinitions(context.conversationConfig), + tools: tools.definitions(), maxOutputTokens: context.agentConfig.maxOutputTokens, metadata: turnMetadata(context), }; @@ -93,6 +96,7 @@ async function runBasicTurnLoop( context, toolCall, round, + (toolCall) => tools.executePending([toolCall]), ); if (toolResultEvents.length > 0) { latestEventId = (await turn.addEvents(toolResultEvents)).latestEventId; diff --git a/examples/typescript/tools/uppercase.ts b/examples/typescript/tools/uppercase.ts new file mode 100644 index 0000000..227aa3d --- /dev/null +++ b/examples/typescript/tools/uppercase.ts @@ -0,0 +1,72 @@ +// Example library tool used by the harness registry tests. No example harness +// exposes this tool to a model yet. 
+ +import type { JsonObject, Tool, ToolResult } from "@exo/harness"; + +interface UppercaseConfig { + prefix: string; +} + +const uppercaseTool = { + definition: { + name: "uppercase", + description: "Uppercase text and optionally prefix the result.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + text: { + type: "string", + description: "Text to uppercase.", + }, + }, + required: ["text"], + }, + outputSchema: { + type: "object", + additionalProperties: false, + properties: { + text: { type: "string" }, + }, + required: ["text"], + }, + }, + initializationParameters: { + type: "object", + additionalProperties: false, + properties: { + prefix: { + type: "string", + description: "Prefix to prepend to each uppercase result.", + }, + }, + required: ["prefix"], + }, + initialize(args) { + const config = parseConfig(args); + return { + async execute(args): Promise { + const text = stringArgument(args, "text"); + return { + text: `${config.prefix}${text.toUpperCase()}`, + }; + }, + }; + }, +} satisfies Tool; + +export default uppercaseTool; + +function parseConfig(args: JsonObject): UppercaseConfig { + return { + prefix: stringArgument(args, "prefix"), + }; +} + +function stringArgument(args: JsonObject, name: string): string { + const value = args[name]; + if (typeof value !== "string") { + throw new Error(`uppercase tool argument ${name} must be a string`); + } + return value; +} diff --git a/typescript/harness/built-in-tools.ts b/typescript/harness/built-in-tools.ts new file mode 100644 index 0000000..682f71e --- /dev/null +++ b/typescript/harness/built-in-tools.ts @@ -0,0 +1,82 @@ +import type { + ConversationConfig, + JsonObject, + ToolDefinition, + ToolResult, + TurnContext, +} from "./index"; +import type { HarnessToolRegistry, ToolInstance } from "./tools"; + +export type BuiltInToolName = "shell"; + +export function registerBuiltInTools( + registry: HarnessToolRegistry, + context: TurnContext, + names: BuiltInToolName[], +): 
void { + for (const name of names) { + if (name === "shell") { + const shell = createShellToolInstance(context.conversationConfig); + if (shell) { + registry.register(shell); + } + } + } +} + +export function createShellToolInstance( + config: ConversationConfig, +): ToolInstance | null { + if (!config.shellProgram) { + return null; + } + return { + source: "built_in", + definition: shellToolDefinition(config.shellProgram), + handler: { + execute(args, execution) { + return execution.context.executeTool({ + functionName: "shell", + arguments: args, + }); + }, + }, + }; +} + +export function buildShellToolDefinitions( + config: ConversationConfig, +): ToolDefinition[] { + const tool = createShellToolInstance(config); + return tool ? [tool.definition] : []; +} + +function shellToolDefinition(shellProgram: string): ToolDefinition { + return { + name: "shell", + description: `Run a shell command using ${shellProgram}.`, + parameters: { + type: "object", + additionalProperties: false, + properties: { + command: { + type: "string", + description: "Shell command to execute.", + }, + }, + required: ["command"], + }, + }; +} + +export function shellToolRequest(args: JsonObject): { + functionName: "shell"; + arguments: JsonObject; +} { + return { + functionName: "shell", + arguments: args, + }; +} + +export type ShellToolResult = ToolResult; diff --git a/typescript/harness/index.test.ts b/typescript/harness/index.test.ts new file mode 100644 index 0000000..e84c8b9 --- /dev/null +++ b/typescript/harness/index.test.ts @@ -0,0 +1,484 @@ +import { describe, expect, it } from "vitest"; + +import { + buildShellToolDefinitions, + createShellToolInstance, + createToolRegistry, + initializeTool, + registerBuiltInTools, + registerAgentToolsFromManifest, + registerLibraryToolsFromManifest, + registerToolsFromManifest, + toolResultEvent, + type EventData, + type JsonObject, + type ToolExecutionContext, + type ToolInstance, + type ToolResult, + type TurnContext, +} from "./index"; 
+import uppercaseTool from "../../examples/typescript/tools/uppercase"; + +describe("HarnessToolRegistry", () => { + it("returns registered tool definitions", () => { + const context = fakeTurnContext(); + const tool = fakeTool("echo", async (args) => args); + const registry = createToolRegistry(context).register(tool); + + expect(registry.definitions()).toEqual([tool.definition]); + expect(registry.get("echo")).toBe(tool); + }); + + it("rejects duplicate tool names", () => { + const context = fakeTurnContext(); + const registry = createToolRegistry(context).register( + fakeTool("echo", async (args) => args), + ); + + expect(() => + registry.register(fakeTool("echo", async (args) => args)), + ).toThrow("tool is already registered: echo"); + }); + + it("executes pending tool calls and returns tool result events", async () => { + const context = fakeTurnContext(); + const executionContexts: ToolExecutionContext[] = []; + const registry = createToolRegistry(context).register( + fakeTool("echo", async (args, execution) => { + executionContexts.push(execution); + return { echoed: args.value }; + }), + ); + + const events = await registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "echo", + arguments: { value: "hello" }, + }, + }, + ]); + + expect(events).toEqual([ + toolResultEvent("call_1", { + echoed: "hello", + }), + ]); + expect(executionContexts).toHaveLength(1); + expect(executionContexts[0].context).toBe(context); + expect(executionContexts[0].toolCallId).toBe("call_1"); + }); + + it("emits stream events around tool execution when streaming", async () => { + const streamEvents: EventData[] = []; + const context = fakeTurnContext({ + streaming: true, + streamEvents, + }); + const registry = createToolRegistry(context).register( + fakeTool("echo", async (args) => ({ echoed: args.value })), + ); + + await registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "echo", + arguments: { value: "hello" }, + }, + 
}, + ]); + + expect(streamEvents).toEqual([ + { + type: "tool_call_streamed", + toolCallId: "call_1", + toolName: "echo", + arguments: { value: "hello" }, + }, + { + type: "tool_result_streamed", + toolCallId: "call_1", + result: { echoed: "hello" }, + }, + ]); + }); + + it("throws for unregistered tools", async () => { + const registry = createToolRegistry(fakeTurnContext()); + + await expect( + registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "missing", + arguments: {}, + }, + }, + ]), + ).rejects.toThrow("tool execution is not configured for missing"); + }); +}); + +describe("shell built-in tool", () => { + it("builds the existing shell tool definition shape", () => { + expect( + buildShellToolDefinitions({ + enableNetworking: false, + shellProgram: "/bin/bash", + mounts: [], + }), + ).toEqual([ + { + name: "shell", + description: "Run a shell command using /bin/bash.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + command: { + type: "string", + description: "Shell command to execute.", + }, + }, + required: ["command"], + }, + }, + ]); + }); + + it("omits the shell definition when shell is disabled", () => { + expect( + buildShellToolDefinitions({ + enableNetworking: false, + shellProgram: null, + mounts: [], + }), + ).toEqual([]); + }); + + it("delegates shell execution to the host tool path", async () => { + const executedRequests: JsonObject[] = []; + const context = fakeTurnContext({ + executeTool: async (request) => { + executedRequests.push({ + functionName: request.functionName, + arguments: request.arguments, + }); + return { + stdout: "ok\n", + stderr: "", + exit_code: 0, + }; + }, + }); + const shell = createShellToolInstance({ + enableNetworking: false, + shellProgram: "/bin/bash", + mounts: [], + }); + + expect(shell).not.toBeNull(); + const result = await shell!.handler.execute( + { command: "echo ok" }, + { + context, + toolCallId: "call_1", + }, + ); + + 
expect(executedRequests).toEqual([ + { + functionName: "shell", + arguments: { command: "echo ok" }, + }, + ]); + expect(result).toEqual({ + stdout: "ok\n", + stderr: "", + exit_code: 0, + }); + }); + + it("registers requested built-in tools", () => { + const context = fakeTurnContext({ + conversationConfig: { + enableNetworking: false, + shellProgram: "/bin/bash", + mounts: [], + }, + }); + const registry = createToolRegistry(context); + + registerBuiltInTools(registry, context, ["shell"]); + + expect(registry.definitions()).toEqual( + buildShellToolDefinitions(context.conversationConfig), + ); + }); +}); + +describe("library tool modules", () => { + it("initializes, registers, and executes a direct TypeScript tool", async () => { + const context = fakeTurnContext(); + const tool = await initializeTool( + uppercaseTool, + "library", + { + prefix: "result: ", + }, + context, + ); + const registry = createToolRegistry(context).register(tool); + + expect(registry.definitions()).toEqual([uppercaseTool.definition]); + await expect( + registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "uppercase", + arguments: { + text: "hello", + }, + }, + }, + ]), + ).resolves.toEqual([ + toolResultEvent("call_1", { + text: "result: HELLO", + }), + ]); + }); +}); + +describe("agent tool loading", () => { + it("loads and registers library tools from a manifest", async () => { + const context = fakeTurnContext(); + const registry = createToolRegistry(context); + + await registerLibraryToolsFromManifest(registry, context, { + tools: [ + { + modulePath: uppercaseToolModulePath(), + initialization: { + prefix: "library: ", + }, + }, + ], + }); + + expect(registry.get("uppercase")?.source).toBe("library"); + await expect( + registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "uppercase", + arguments: { + text: "hello", + }, + }, + }, + ]), + ).resolves.toEqual([ + toolResultEvent("call_1", { + text: "library: HELLO", + }), + ]); 
+ }); + + it("loads and registers agent tools from a manifest", async () => { + const context = fakeTurnContext(); + const registry = createToolRegistry(context); + + await registerAgentToolsFromManifest(registry, context, { + tools: [ + { + modulePath: uppercaseToolModulePath(), + initialization: { + prefix: "agent: ", + }, + }, + ], + }); + + expect(registry.definitions()).toEqual([uppercaseTool.definition]); + expect(registry.get("uppercase")?.source).toBe("agent"); + await expect( + registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "uppercase", + arguments: { + text: "hello", + }, + }, + }, + ]), + ).resolves.toEqual([ + toolResultEvent("call_1", { + text: "agent: HELLO", + }), + ]); + }); + + it("loads tools through the generic source-aware manifest path", async () => { + const context = fakeTurnContext(); + const registry = createToolRegistry(context); + + await registerToolsFromManifest( + registry, + context, + { + tools: [ + { + modulePath: uppercaseToolModulePath(), + initialization: { + prefix: "generic: ", + }, + }, + ], + }, + "library", + ); + + expect(registry.get("uppercase")?.source).toBe("library"); + }); + + it("rejects agent tool modules without a default Tool export", async () => { + const registry = createToolRegistry(fakeTurnContext()); + + await expect( + registerAgentToolsFromManifest(registry, fakeTurnContext(), { + tools: [ + { + modulePath: "data:text/javascript,export const value = 1;", + initialization: {}, + }, + ], + }), + ).rejects.toThrow("agent tool module must default export a Tool"); + }); + + it("rejects invalid agent tool initialization", async () => { + const registry = createToolRegistry(fakeTurnContext()); + + await expect( + registerAgentToolsFromManifest(registry, fakeTurnContext(), { + tools: [ + { + modulePath: uppercaseToolModulePath(), + initialization: {}, + }, + ], + }), + ).rejects.toThrow("tool initialization.prefix is required"); + }); +}); + +function fakeTool( + name: string, + 
execute: ( + args: JsonObject, + execution: ToolExecutionContext, + ) => Promise, +): ToolInstance { + return { + source: "library", + definition: { + name, + description: `Fake ${name} tool.`, + parameters: { + type: "object", + additionalProperties: true, + }, + }, + handler: { + execute, + }, + }; +} + +function uppercaseToolModulePath(): string { + return new URL( + "../../examples/typescript/tools/uppercase.ts", + import.meta.url, + ).href; +} + +function fakeTurnContext( + options: { + streaming?: boolean; + streamEvents?: EventData[]; + executeTool?: TurnContext["executeTool"]; + conversationConfig?: TurnContext["conversationConfig"]; + } = {}, +): TurnContext { + const streamEvents = options.streamEvents ?? []; + return { + agentConfig: { + instructions: [], + harness: "typescript", + typescript: null, + sandboxImage: null, + enableNetworking: false, + model: "test-model", + maxOutputTokens: null, + maxToolRoundTrips: null, + braintrust: null, + }, + conversationConfig: options.conversationConfig ?? { + enableNetworking: false, + shellProgram: null, + mounts: [], + }, + request: { + input: [], + sessionId: null, + }, + streaming: options.streaming ?? false, + braintrustParent: null, + exoharness: { + current: { + agent: {}, + conversation: {}, + turn: {}, + }, + }, + executeTool: options.executeTool ?? 
(async () => null), + async startSandboxProcess() { + throw new Error("not implemented"); + }, + async executePendingTools() { + return []; + }, + stream: { + async firstChunk(ttftMs: number) { + streamEvents.push({ type: "first_chunk_streamed", ttftMs }); + }, + async text(text: string) { + streamEvents.push({ type: "text_streamed", text }); + }, + async toolCall(args: { + toolCallId: string; + toolName: string; + arguments: JsonObject; + }) { + streamEvents.push({ + type: "tool_call_streamed", + ...args, + }); + }, + async toolResult(args: { toolCallId: string; result: ToolResult }) { + streamEvents.push({ + type: "tool_result_streamed", + ...args, + }); + }, + }, + } as unknown as TurnContext; +} diff --git a/typescript/harness/index.ts b/typescript/harness/index.ts index 7f24c2f..2770a60 100644 --- a/typescript/harness/index.ts +++ b/typescript/harness/index.ts @@ -4,6 +4,10 @@ export interface JsonObject { [key: string]: JsonValue; } +export * from "./tools"; +export * from "./built-in-tools"; +export * from "./tool-manifest"; + export type MessageRole = | "system" | "developer" @@ -94,6 +98,7 @@ export interface ToolDefinition { name: string; description: string; parameters: JsonValue; + outputSchema?: JsonValue; } export interface ToolRequest { @@ -609,32 +614,6 @@ export function toolResultMessage( }; } -export function buildShellToolDefinitions( - config: ConversationConfig, -): ToolDefinition[] { - if (!config.shellProgram) { - return []; - } - - return [ - { - name: "shell", - description: `Run a shell command using ${config.shellProgram}.`, - parameters: { - type: "object", - additionalProperties: false, - properties: { - command: { - type: "string", - description: "Shell command to execute.", - }, - }, - required: ["command"], - }, - }, - ]; -} - export function filterMessages( messages: Message[], role?: MessageRole, diff --git a/typescript/harness/tool-manifest.ts b/typescript/harness/tool-manifest.ts new file mode 100644 index 0000000..1f7405e --- 
/dev/null +++ b/typescript/harness/tool-manifest.ts @@ -0,0 +1,101 @@ +import type { JsonObject, TurnContext } from "./index"; +import { + initializeTool, + type HarnessToolRegistry, + type HarnessToolSource, + type Tool, + type ToolInstance, +} from "./tools"; + +export interface ToolManifest { + tools: ToolManifestEntry[]; +} + +export interface ToolManifestEntry { + modulePath: string; + initialization: JsonObject; +} + +export type LibraryToolManifest = ToolManifest; +export type LibraryToolManifestEntry = ToolManifestEntry; +export type AgentToolManifest = ToolManifest; +export type AgentToolManifestEntry = ToolManifestEntry; + +export async function registerToolsFromManifest( + registry: HarnessToolRegistry, + context: TurnContext, + manifest: ToolManifest, + source: Extract, +): Promise { + for (const entry of manifest.tools) { + registry.register(await loadToolFromManifestEntry(context, entry, source)); + } +} + +export function registerLibraryToolsFromManifest( + registry: HarnessToolRegistry, + context: TurnContext, + manifest: LibraryToolManifest, +): Promise { + return registerToolsFromManifest(registry, context, manifest, "library"); +} + +export function registerAgentToolsFromManifest( + registry: HarnessToolRegistry, + context: TurnContext, + manifest: AgentToolManifest, +): Promise { + return registerToolsFromManifest(registry, context, manifest, "agent"); +} + +export async function loadToolFromManifestEntry( + context: TurnContext, + entry: ToolManifestEntry, + source: Extract, +): Promise { + const tool = await importTool(entry.modulePath, source); + return initializeTool(tool, source, entry.initialization, context); +} + +export function loadLibraryTool( + context: TurnContext, + entry: LibraryToolManifestEntry, +): Promise { + return loadToolFromManifestEntry(context, entry, "library"); +} + +export function loadAgentTool( + context: TurnContext, + entry: AgentToolManifestEntry, +): Promise { + return loadToolFromManifestEntry(context, entry, 
"agent"); +} + +async function importTool( + modulePath: string, + source: Extract, +): Promise { + const module = (await import(modulePath)) as { default?: unknown }; + if (!isTool(module.default)) { + throw new Error( + `${source} tool module must default export a Tool: ${modulePath}`, + ); + } + return module.default; +} + +function isTool(value: unknown): value is Tool { + if (!value || typeof value !== "object") { + return false; + } + const candidate = value as { + definition?: unknown; + initializationParameters?: unknown; + initialize?: unknown; + }; + return ( + Boolean(candidate.definition) && + Boolean(candidate.initializationParameters) && + typeof candidate.initialize === "function" + ); +} diff --git a/typescript/harness/tools.ts b/typescript/harness/tools.ts new file mode 100644 index 0000000..56ac798 --- /dev/null +++ b/typescript/harness/tools.ts @@ -0,0 +1,212 @@ +import type { + EventData, + JsonObject, + JsonValue, + PendingToolCall, + ToolDefinition, + ToolResult, + TurnContext, +} from "./index"; + +export type HarnessToolSource = "built_in" | "library" | "agent"; + +export interface ToolExecutionContext { + readonly context: TurnContext; + readonly toolCallId?: string; +} + +export interface ToolHandler { + execute( + args: JsonObject, + execution: ToolExecutionContext, + ): Promise; +} + +export interface ToolInstance { + definition: ToolDefinition; + source: HarnessToolSource; + handler: ToolHandler; +} + +export interface ToolInitializationContext { + readonly context: TurnContext; + readonly source: HarnessToolSource; +} + +export interface Tool { + definition: ToolDefinition; + initializationParameters: JsonValue; + initialize( + args: JsonObject, + initialization: ToolInitializationContext, + ): Promise | ToolHandler; +} + +export class HarnessToolRegistry { + private readonly tools = new Map(); + + constructor(private readonly context: TurnContext) {} + + register(tool: ToolInstance): this { + const { name } = tool.definition; + if 
(this.tools.has(name)) { + throw new Error(`tool is already registered: ${name}`); + } + this.tools.set(name, tool); + return this; + } + + definitions(): ToolDefinition[] { + return [...this.tools.values()].map((tool) => tool.definition); + } + + get(name: string): ToolInstance | undefined { + return this.tools.get(name); + } + + async executePending(toolCalls: PendingToolCall[]): Promise { + const events: EventData[] = []; + for (const toolCall of toolCalls) { + const result = await this.executeToolCall(toolCall); + events.push(toolResultEvent(toolCall.toolCallId, result)); + } + return events; + } + + private async executeToolCall( + toolCall: PendingToolCall, + ): Promise { + const tool = this.tools.get(toolCall.request.functionName); + if (!tool) { + throw new Error( + `tool execution is not configured for ${toolCall.request.functionName}`, + ); + } + if (this.context.streaming) { + await this.context.stream.toolCall({ + toolCallId: toolCall.toolCallId, + toolName: toolCall.request.functionName, + arguments: toolCall.request.arguments, + }); + } + const result = await tool.handler.execute(toolCall.request.arguments, { + context: this.context, + toolCallId: toolCall.toolCallId, + }); + if (this.context.streaming) { + await this.context.stream.toolResult({ + toolCallId: toolCall.toolCallId, + result, + }); + } + return result; + } +} + +export function createToolRegistry(context: TurnContext): HarnessToolRegistry { + return new HarnessToolRegistry(context); +} + +export async function initializeTool( + tool: Tool, + source: HarnessToolSource, + initializationArgs: JsonObject, + context: TurnContext, +): Promise { + validateJsonSchema( + tool.initializationParameters, + initializationArgs, + "tool initialization", + ); + return { + definition: tool.definition, + source, + handler: await tool.initialize(initializationArgs, { + context, + source, + }), + }; +} + +function validateJsonSchema( + schema: JsonValue, + value: JsonValue, + path: string, +): void { + if 
(!isRecord(schema)) { + return; + } + const type = schema.type; + if (type !== undefined && !matchesJsonSchemaType(type, value)) { + throw new Error( + `${path} does not match schema type ${formatSchemaType(type)}`, + ); + } + if (type !== "object" || !isRecord(value)) { + return; + } + const properties = isRecord(schema.properties) ? schema.properties : {}; + const required = Array.isArray(schema.required) ? schema.required : []; + for (const requiredKey of required) { + if (typeof requiredKey === "string" && !(requiredKey in value)) { + throw new Error(`${path}.${requiredKey} is required`); + } + } + if (schema.additionalProperties === false) { + for (const key of Object.keys(value)) { + if (!(key in properties)) { + throw new Error(`${path}.${key} is not allowed`); + } + } + } + for (const [key, propertySchema] of Object.entries(properties)) { + if (key in value) { + validateJsonSchema( + propertySchema as JsonValue, + value[key], + `${path}.${key}`, + ); + } + } +} + +function matchesJsonSchemaType(type: JsonValue, value: JsonValue): boolean { + if (Array.isArray(type)) { + return type.some((candidate) => matchesJsonSchemaType(candidate, value)); + } + if (type === "null") { + return value === null; + } + if (type === "array") { + return Array.isArray(value); + } + if (type === "object") { + return isRecord(value); + } + if (type === "string") { + return typeof value === "string"; + } + if (type === "number") { + return typeof value === "number"; + } + if (type === "boolean") { + return typeof value === "boolean"; + } + return true; +} + +function formatSchemaType(type: JsonValue): string { + return Array.isArray(type) ? 
type.map(String).join(" | ") : String(type); +} + +function isRecord(value: JsonValue): value is JsonObject { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function toolResultEvent(toolCallId: string, result: ToolResult): EventData { + return { + type: "tool_result", + tool_call_id: toolCallId, + result, + }; +} diff --git a/typescript/model-runtime/responses.ts b/typescript/model-runtime/responses.ts index 031a9d1..db72704 100644 --- a/typescript/model-runtime/responses.ts +++ b/typescript/model-runtime/responses.ts @@ -71,6 +71,10 @@ export interface NativeStreamHandlers { export type TraceParent = Span | string; +export type ToolCallExecutor = ( + toolCall: PendingToolCall, +) => Promise; + export interface NativeTraceOptions { parent?: TraceParent; roundIndex?: number; @@ -91,6 +95,7 @@ export interface ResponsesRuntimeLike { context: TurnContext, toolCall: PendingToolCall, roundIndex: number, + execute?: ToolCallExecutor, ): Promise; } @@ -176,12 +181,14 @@ export class ResponsesRuntime implements ResponsesRuntimeLike { context: TurnContext, toolCall: PendingToolCall, roundIndex: number, + execute: ToolCallExecutor = (toolCall) => + context.executePendingTools([toolCall]), ): Promise { return tracedUnderParent( turnParent, async (span) => { try { - const events = await context.executePendingTools([toolCall]); + const events = await execute(toolCall); span.log({ output: toolResultTraceOutput(events) }); return events; } catch (error) { From 7d27b783e61f6e5cdab9e958ebe121c31e82c701 Mon Sep 17 00:00:00 2001 From: 61cygni <> Date: Fri, 22 May 2026 22:00:47 -0700 Subject: [PATCH 2/8] Add demo IRC tool Co-authored-by: Cursor --- examples/typescript/tools/irc.ts | 248 +++++++++++++++++++++++++++++++ typescript/harness/index.test.ts | 44 ++++++ 2 files changed, 292 insertions(+) create mode 100644 examples/typescript/tools/irc.ts diff --git a/examples/typescript/tools/irc.ts b/examples/typescript/tools/irc.ts new file mode 100644 
index 0000000..ba3f674 --- /dev/null +++ b/examples/typescript/tools/irc.ts @@ -0,0 +1,248 @@ +// Demo library tool for the TypeScript harness tool system. This example shows +// how a networked service integration can be packaged as a default-export Tool. +// It is not exposed by any example harness by default. + +import net from "node:net"; +import tls from "node:tls"; + +import type { JsonObject, Tool, ToolResult, TurnContext } from "@exo/harness"; + +interface IrcConfig { + server: string; + port: number; + nick: string; + username: string; + realname: string; + tls: boolean; + dryRun: boolean; + passwordSecretId: string | null; +} + +const ircTool = { + definition: { + name: "irc_send_message", + description: "Send a message to an IRC channel.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + channel: { + type: "string", + description: "IRC channel name, for example #exo.", + }, + text: { + type: "string", + description: "Message text to send.", + }, + }, + required: ["channel", "text"], + }, + outputSchema: { + type: "object", + additionalProperties: false, + properties: { + ok: { type: "boolean" }, + dryRun: { type: "boolean" }, + server: { type: "string" }, + channel: { type: "string" }, + }, + required: ["ok", "dryRun", "server", "channel"], + }, + }, + initializationParameters: { + type: "object", + additionalProperties: false, + properties: { + server: { + type: "string", + description: "IRC server hostname.", + }, + port: { + type: "number", + description: "IRC server port.", + }, + nick: { + type: "string", + description: "Nickname to use for the IRC connection.", + }, + username: { + type: "string", + description: "Username to send in the IRC USER command.", + }, + realname: { + type: "string", + description: "Real name to send in the IRC USER command.", + }, + tls: { + type: "boolean", + description: "Whether to connect with TLS.", + }, + dryRun: { + type: "boolean", + description: "If true, build IRC commands 
without opening a socket.", + }, + passwordSecretId: { + type: ["string", "null"], + description: "Optional secret id containing an IRC server password.", + }, + }, + required: ["server", "port", "nick", "username", "realname", "tls"], + }, + initialize(args) { + const config = parseConfig(args); + return { + async execute(args, execution): Promise { + return sendIrcMessage(execution.context, config, args); + }, + }; + }, +} satisfies Tool; + +export default ircTool; + +async function sendIrcMessage( + context: TurnContext, + config: IrcConfig, + args: JsonObject, +): Promise { + const channel = stringArgument(args, "channel"); + const text = stringArgument(args, "text"); + const password = await resolvePassword(context, config.passwordSecretId); + const commands = ircCommands(config, channel, text, password); + + if (!config.dryRun) { + await withIrcConnection(config, async (socket) => { + for (const command of commands) { + socket.write(`${command}\r\n`); + } + }); + } + + return { + ok: true, + dryRun: config.dryRun, + server: config.server, + channel, + }; +} + +function ircCommands( + config: IrcConfig, + channel: string, + text: string, + password: string | null, +): string[] { + return [ + ...(password ? 
[`PASS ${password}`] : []), + `NICK ${config.nick}`, + `USER ${config.username} 0 * :${config.realname}`, + `PRIVMSG ${channel} :${text}`, + "QUIT", + ]; +} + +async function resolvePassword( + context: TurnContext, + secretId: string | null, +): Promise { + if (!secretId) { + return null; + } + const secret = + await context.exoharness.current.conversation.getSecret(secretId); + if (!secret) { + throw new Error(`IRC password secret does not exist: ${secretId}`); + } + if (secret.type !== "key") { + throw new Error("IRC password secret must be a key secret"); + } + return secret.value; +} + +async function withIrcConnection( + config: IrcConfig, + run: (socket: net.Socket) => Promise | void, +): Promise { + await new Promise((resolve, reject) => { + const socket = config.tls + ? tls.connect(config.port, config.server) + : net.connect(config.port, config.server); + socket.setEncoding("utf8"); + socket.setTimeout(10_000); + socket.once("connect", async () => { + try { + await run(socket); + socket.end(resolve); + } catch (error) { + socket.destroy(); + reject(error); + } + }); + socket.once("error", reject); + socket.once("timeout", () => { + socket.destroy(new Error("IRC connection timed out")); + }); + }); +} + +function parseConfig(args: JsonObject): IrcConfig { + return { + server: stringArgument(args, "server"), + port: numberArgument(args, "port"), + nick: stringArgument(args, "nick"), + username: stringArgument(args, "username"), + realname: stringArgument(args, "realname"), + tls: booleanArgument(args, "tls"), + dryRun: optionalBooleanArgument(args, "dryRun") ?? 
false, + passwordSecretId: optionalStringArgument(args, "passwordSecretId"), + }; +} + +function stringArgument(args: JsonObject, name: string): string { + const value = args[name]; + if (typeof value !== "string" || value.length === 0) { + throw new Error(`IRC tool argument ${name} must be a non-empty string`); + } + return value; +} + +function optionalStringArgument(args: JsonObject, name: string): string | null { + const value = args[name]; + if (value === undefined || value === null) { + return null; + } + if (typeof value !== "string" || value.length === 0) { + throw new Error(`IRC tool argument ${name} must be a non-empty string`); + } + return value; +} + +function numberArgument(args: JsonObject, name: string): number { + const value = args[name]; + if (typeof value !== "number") { + throw new Error(`IRC tool argument ${name} must be a number`); + } + return value; +} + +function booleanArgument(args: JsonObject, name: string): boolean { + const value = args[name]; + if (typeof value !== "boolean") { + throw new Error(`IRC tool argument ${name} must be a boolean`); + } + return value; +} + +function optionalBooleanArgument( + args: JsonObject, + name: string, +): boolean | null { + const value = args[name]; + if (value === undefined || value === null) { + return null; + } + if (typeof value !== "boolean") { + throw new Error(`IRC tool argument ${name} must be a boolean`); + } + return value; +} diff --git a/typescript/harness/index.test.ts b/typescript/harness/index.test.ts index e84c8b9..8b0092b 100644 --- a/typescript/harness/index.test.ts +++ b/typescript/harness/index.test.ts @@ -17,6 +17,7 @@ import { type ToolResult, type TurnContext, } from "./index"; +import ircTool from "../../examples/typescript/tools/irc"; import uppercaseTool from "../../examples/typescript/tools/uppercase"; describe("HarnessToolRegistry", () => { @@ -252,6 +253,49 @@ describe("library tool modules", () => { }), ]); }); + + it("initializes and executes the demo IRC tool in 
dry-run mode", async () => { + const context = fakeTurnContext(); + const tool = await initializeTool( + ircTool, + "library", + { + server: "irc.example.test", + port: 6697, + nick: "exo-agent", + username: "exo", + realname: "Exo Agent", + tls: true, + dryRun: true, + passwordSecretId: null, + }, + context, + ); + const registry = createToolRegistry(context).register(tool); + + expect(registry.definitions()).toEqual([ircTool.definition]); + await expect( + registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "irc_send_message", + arguments: { + channel: "#exo", + text: "hello", + }, + }, + }, + ]), + ).resolves.toEqual([ + toolResultEvent("call_1", { + ok: true, + dryRun: true, + server: "irc.example.test", + channel: "#exo", + }), + ]); + }); }); describe("agent tool loading", () => { From 1913848db4a4829be461afca9d440e2b09cbab76 Mon Sep 17 00:00:00 2001 From: 61cygni <> Date: Fri, 22 May 2026 22:28:50 -0700 Subject: [PATCH 3/8] Fix env-file lookup for CLI secrets Allow secret registration from --env to use values loaded from --env-file, so users can register model keys without exporting them in the shell. 
Co-authored-by: Cursor --- crates/cli/src/main.rs | 13 ++++++++++--- crates/cli/src/secret_tests.rs | 19 +++++++++++++++++-- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 34b8a4c..8aab0a1 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -1043,7 +1043,7 @@ async fn main() -> Result<(), Box> { } SecretCommands::Set { name, env, value } => { let value = match (env, value) { - (Some(env), None) => secret_value_from_env_arg(&env)?, + (Some(env), None) => secret_value_from_env_arg(&env, &env_vars)?, (None, Some(value)) => value, (Some(_), Some(_)) => { return Err("provide either --env or --value, not both".into()); @@ -1549,7 +1549,10 @@ fn slugify(input: &str) -> String { slug } -pub(crate) fn secret_value_from_env_arg(env: &str) -> Result { +pub(crate) fn secret_value_from_env_arg( + env: &str, + loaded_env: &HashMap, +) -> Result { if !is_env_var_name(env) { return Err( "invalid --env value; pass an environment variable name such as OPENAI_API_KEY, not the secret value" @@ -1557,7 +1560,11 @@ pub(crate) fn secret_value_from_env_arg(env: &str) -> Result { ); } - std::env::var(env).map_err(|_| "environment variable passed to --env is not set".to_string()) + loaded_env + .get(env) + .cloned() + .or_else(|| std::env::var(env).ok()) + .ok_or_else(|| "environment variable passed to --env is not set".to_string()) } fn is_env_var_name(env: &str) -> bool { diff --git a/crates/cli/src/secret_tests.rs b/crates/cli/src/secret_tests.rs index 78e572a..7f7b988 100644 --- a/crates/cli/src/secret_tests.rs +++ b/crates/cli/src/secret_tests.rs @@ -1,10 +1,13 @@ +use std::collections::HashMap; + use crate::secret_value_from_env_arg; #[test] fn env_secret_error_does_not_echo_shell_expanded_secret() { let expanded_secret = "sk-proj-sensitive-secret-value"; - let error = secret_value_from_env_arg(expanded_secret).expect_err("secret should be rejected"); + let error = 
secret_value_from_env_arg(expanded_secret, &HashMap::new()) + .expect_err("secret should be rejected"); assert!(!error.contains(expanded_secret)); assert!(error.contains("not the secret value")); @@ -14,8 +17,20 @@ fn env_secret_error_does_not_echo_shell_expanded_secret() { fn unset_env_secret_error_does_not_echo_env_name() { let env_name = "EXO_TEST_SECRET_THAT_SHOULD_NOT_EXIST"; - let error = secret_value_from_env_arg(env_name).expect_err("env var should be unset"); + let error = + secret_value_from_env_arg(env_name, &HashMap::new()).expect_err("env var should be unset"); assert!(!error.contains(env_name)); assert!(error.contains("--env")); } + +#[test] +fn env_secret_can_come_from_loaded_env_file_values() { + let env_name = "EXO_TEST_SECRET_FROM_ENV_FILE"; + let mut loaded_env = HashMap::new(); + loaded_env.insert(env_name.to_string(), "loaded-secret".to_string()); + + let secret = secret_value_from_env_arg(env_name, &loaded_env).expect("secret should resolve"); + + assert_eq!(secret, "loaded-secret"); +} From cd070777fadfa28db29139ced7d3b5baef1bdc23 Mon Sep 17 00:00:00 2001 From: 61cygni <> Date: Fri, 22 May 2026 22:55:59 -0700 Subject: [PATCH 4/8] Add CLI library tool manifests Enable TypeScript agents to load library tools from manifest files so REPL agents can use user-provided tools without custom harness code. 
Co-authored-by: Cursor --- Cargo.lock | 1 + crates/cli/Cargo.toml | 1 + crates/cli/src/main.rs | 110 +- crates/executor/src/basic_tests.rs | 1 + crates/executor/src/executor_types.rs | 8 + crates/executor/src/harness_basic_tests.rs | 8 + crates/executor/src/harness_facade.rs | 1 + crates/executor/src/harness_types.rs | 1 + crates/executor/src/lib.rs | 4 +- crates/executor/src/rlm_tests.rs | 5 + docs/tools.md | 24 +- examples/typescript/basic-harness.ts | 4 + examples/typescript/tools/irc.manifest.json | 17 + examples/typescript/tools/irc.ts | 146 ++- .../typescript/tools/uppercase.manifest.json | 10 + tools.md | 1079 +++++++++++++++++ typescript/harness/index.test.ts | 3 + typescript/harness/index.ts | 4 + typescript/harness/runner.ts | 17 + 19 files changed, 1419 insertions(+), 25 deletions(-) create mode 100644 examples/typescript/tools/irc.manifest.json create mode 100644 examples/typescript/tools/uppercase.manifest.json create mode 100644 tools.md diff --git a/Cargo.lock b/Cargo.lock index 44a1a45..d660545 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1837,6 +1837,7 @@ dependencies = [ "executor", "lingua", "rustyline", + "serde", "serde_json 1.0.149", "tempfile", "tokio", diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 8fe9c87..3ba7fb2 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -15,6 +15,7 @@ clap.workspace = true executor = { path = "../executor" } lingua.workspace = true rustyline = "15.0.0" +serde.workspace = true serde_json.workspace = true tokio.workspace = true tokio-stream.workspace = true diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 8aab0a1..e649817 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -21,8 +21,8 @@ use executor::{ BraintrustRuntimeConfig, BraintrustTracingConfig, ConversationModelConfig, CreateAgentRequest, CreateConversationRequest, EventQuery, EventQueryDirection, ExoHarness, FileSystemMount, FileSystemMountMode, ForkConversationRequest, Harness, 
HarnessAgent, HarnessConversation, - PutSecretRequest, RlmHarness, SANDBOX_MAIN_MOUNT_DIR, Secret, SendRequest, TypeScriptHarness, - TypeScriptHarnessConfig, Uuid7, load_agent_config, + PutSecretRequest, RlmHarness, SANDBOX_MAIN_MOUNT_DIR, Secret, SendRequest, ToolManifestEntry, + TypeScriptHarness, TypeScriptHarnessConfig, Uuid7, load_agent_config, }; use lingua::Message; use lingua::universal::{AssistantContent, AssistantContentPart, ToolContentPart, UserContent}; @@ -115,6 +115,8 @@ enum AgentCommands { slug: Option, #[arg(long)] module: Option, + #[arg(long = "tool-manifest")] + tool_manifests: Vec, #[arg(long)] sandbox_image: Option, #[arg(long, value_enum)] @@ -140,6 +142,10 @@ enum AgentCommands { module: Option, #[arg(long)] clear_module: bool, + #[arg(long = "tool-manifest")] + tool_manifests: Vec, + #[arg(long)] + clear_tool_manifests: bool, #[arg(long)] sandbox_image: Option, #[arg(long)] @@ -391,6 +397,7 @@ async fn main() -> Result<(), Box> { name, slug, module, + tool_manifests, sandbox_image, networking, model, @@ -410,13 +417,16 @@ async fn main() -> Result<(), Box> { { return Err("sandbox image must not be empty".into()); } + let agent_harness_kind = to_agent_harness_kind(harness_kind); let typescript = build_typescript_harness_config(harness_kind, module.as_deref())?; + let library_tools = load_tool_manifests(agent_harness_kind, &tool_manifests)?; let agent = harness .create_agent(CreateAgentRequest { slug, name: Some(name), - harness: to_agent_harness_kind(harness_kind), + harness: agent_harness_kind, typescript, + library_tools, sandbox_image, enable_networking: matches!(networking, Some(NetworkingMode::Enabled)), model, @@ -440,6 +450,8 @@ async fn main() -> Result<(), Box> { set_harness, module, clear_module, + tool_manifests, + clear_tool_manifests, sandbox_image, clear_sandbox_image, networking, @@ -456,6 +468,11 @@ async fn main() -> Result<(), Box> { if clear_module && module.is_some() { return Err("provide either --clear-module or 
--module, not both".into()); } + if clear_tool_manifests && !tool_manifests.is_empty() { + return Err( + "provide either --clear-tool-manifests or --tool-manifest, not both".into(), + ); + } if clear_sandbox_image && sandbox_image.is_some() { return Err( "provide either --clear-sandbox-image or --sandbox-image, not both".into(), @@ -503,6 +520,18 @@ async fn main() -> Result<(), Box> { changed = true; } } + if clear_tool_manifests { + if !config.library_tools.is_empty() { + config.library_tools.clear(); + changed = true; + } + } else if !tool_manifests.is_empty() { + let library_tools = load_tool_manifests(config.harness, &tool_manifests)?; + if config.library_tools != library_tools { + config.library_tools = library_tools; + changed = true; + } + } if clear_sandbox_image { config.sandbox_image = None; changed = true; @@ -565,7 +594,7 @@ async fn main() -> Result<(), Box> { )?; if updated_braintrust.is_none() && !changed { return Err( - "no changes provided; pass --set-harness, --module, --sandbox-image, --networking, model flags, --clear-braintrust, or Braintrust project flags" + "no changes provided; pass --set-harness, --module, --tool-manifest, --sandbox-image, --networking, model flags, --clear-braintrust, or Braintrust project flags" .into(), ); } @@ -600,6 +629,10 @@ async fn main() -> Result<(), Box> { .map(|config| config.module_path.as_str()) .unwrap_or("none") ); + println!("library_tools: {}", config.library_tools.len()); + for tool in &config.library_tools { + println!(" - {}", tool.module_path); + } println!( "sandbox_image: {}", config.sandbox_image.as_deref().unwrap_or("default") @@ -1238,6 +1271,75 @@ fn resolve_typescript_module_path( }) } +#[derive(serde::Deserialize)] +struct CliToolManifest { + tools: Vec, +} + +#[derive(serde::Deserialize)] +struct CliToolManifestEntry { + #[serde(rename = "modulePath", alias = "module_path")] + module_path: String, + #[serde(default = "empty_json_object")] + initialization: serde_json::Value, +} + +fn 
empty_json_object() -> serde_json::Value { + serde_json::Value::Object(serde_json::Map::new()) +} + +fn load_tool_manifests( + harness_kind: AgentHarnessKind, + paths: &[PathBuf], +) -> Result, Box> { + if paths.is_empty() { + return Ok(Vec::new()); + } + if harness_kind != AgentHarnessKind::TypeScript { + return Err("--tool-manifest is only valid with TypeScript agents".into()); + } + + let mut tools = Vec::new(); + for path in paths { + let manifest_path = std::fs::canonicalize(path)?; + let manifest_dir = manifest_path + .parent() + .ok_or_else(|| format!("tool manifest has no parent directory: {}", path.display()))?; + let manifest: CliToolManifest = + serde_json::from_str(&std::fs::read_to_string(&manifest_path)?)?; + + for entry in manifest.tools { + if entry.module_path.trim().is_empty() { + return Err(format!( + "tool manifest {} contains an empty modulePath", + manifest_path.display() + ) + .into()); + } + if !entry.initialization.is_object() { + return Err(format!( + "tool manifest {} entry {} must use an object initialization value", + manifest_path.display(), + entry.module_path + ) + .into()); + } + + let module_path = PathBuf::from(&entry.module_path); + let resolved_module_path = if module_path.is_absolute() { + std::fs::canonicalize(&module_path)? + } else { + std::fs::canonicalize(manifest_dir.join(module_path))? 
+ }; + tools.push(ToolManifestEntry { + module_path: resolved_module_path.to_string_lossy().into_owned(), + initialization: entry.initialization, + }); + } + } + Ok(tools) +} + struct RegisteredModel { name: String, model: String, diff --git a/crates/executor/src/basic_tests.rs b/crates/executor/src/basic_tests.rs index 4da29f3..494ca4d 100644 --- a/crates/executor/src/basic_tests.rs +++ b/crates/executor/src/basic_tests.rs @@ -988,6 +988,7 @@ fn default_agent_config() -> AgentConfig { instructions: Vec::new(), harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "test-model".to_string(), diff --git a/crates/executor/src/executor_types.rs b/crates/executor/src/executor_types.rs index 32083a1..8976f9e 100644 --- a/crates/executor/src/executor_types.rs +++ b/crates/executor/src/executor_types.rs @@ -23,6 +23,8 @@ pub struct AgentConfig { #[serde(default)] pub typescript: Option, #[serde(default)] + pub library_tools: Vec, + #[serde(default)] pub sandbox_image: Option, #[serde(default)] pub enable_networking: bool, @@ -47,6 +49,12 @@ pub struct TypeScriptHarnessConfig { pub module_path: String, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ToolManifestEntry { + pub module_path: String, + pub initialization: Value, +} + #[derive(Debug, Clone, Serialize, serde::Deserialize)] pub struct ConversationConfig { pub enable_networking: bool, diff --git a/crates/executor/src/harness_basic_tests.rs b/crates/executor/src/harness_basic_tests.rs index 7663ee2..81e7f6a 100644 --- a/crates/executor/src/harness_basic_tests.rs +++ b/crates/executor/src/harness_basic_tests.rs @@ -41,6 +41,7 @@ async fn creates_agents_and_conversations_with_persisted_config() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -111,6 +112,7 
@@ async fn send_persists_messages_through_harness() { name: None, harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -165,6 +167,7 @@ async fn close_session_appends_session_ended_event() { name: None, harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -248,6 +251,7 @@ async fn updating_agent_config_refreshes_executor_cache() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -328,6 +332,7 @@ async fn send_executes_shell_tool_when_enabled() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: true, model: "gpt-5.4".to_string(), @@ -416,6 +421,7 @@ async fn harness_exposes_raw_exoharness_handles() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -495,6 +501,7 @@ async fn updating_mounts_recreates_shell_sandbox() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -626,6 +633,7 @@ async fn conversation_model_override_changes_effective_model() { name: None, harness: crate::AgentHarnessKind::Basic, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), diff --git a/crates/executor/src/harness_facade.rs b/crates/executor/src/harness_facade.rs index 2ff1768..0973bdb 100644 --- a/crates/executor/src/harness_facade.rs +++ 
b/crates/executor/src/harness_facade.rs @@ -124,6 +124,7 @@ where instructions: Vec::new(), harness: request.harness, typescript: request.typescript, + library_tools: request.library_tools, sandbox_image: request.sandbox_image, enable_networking: request.enable_networking, model: request.model, diff --git a/crates/executor/src/harness_types.rs b/crates/executor/src/harness_types.rs index 862e71a..259cbb4 100644 --- a/crates/executor/src/harness_types.rs +++ b/crates/executor/src/harness_types.rs @@ -62,6 +62,7 @@ pub struct CreateAgentRequest { pub name: Option, pub harness: AgentHarnessKind, pub typescript: Option, + pub library_tools: Vec, pub sandbox_image: Option, pub enable_networking: bool, pub model: String, diff --git a/crates/executor/src/lib.rs b/crates/executor/src/lib.rs index 9e5c6a3..3a9575f 100644 --- a/crates/executor/src/lib.rs +++ b/crates/executor/src/lib.rs @@ -27,8 +27,8 @@ pub use braintrust::{BraintrustProject, BraintrustRuntimeConfig, BraintrustTraci pub use executor_types::{ AgentConfig, AgentHarnessKind, ConversationConfig, ConversationModelConfig, ExecutionStreamEvent, ExecutionStreamHandle, ModelClient, ModelRequest, ModelResponse, - ModelResponseStream, PendingToolCall, SendRequest, SendResult, ToolDefinition, ToolRuntime, - TypeScriptHarnessConfig, + ModelResponseStream, PendingToolCall, SendRequest, SendResult, ToolDefinition, + ToolManifestEntry, ToolRuntime, TypeScriptHarnessConfig, }; pub use exoharness::{ AgentHandle, BasicExoHarness, Binding, BindingMetadata, ConversationHandle, EventData, EventId, diff --git a/crates/executor/src/rlm_tests.rs b/crates/executor/src/rlm_tests.rs index a3f4b22..ab22038 100644 --- a/crates/executor/src/rlm_tests.rs +++ b/crates/executor/src/rlm_tests.rs @@ -62,6 +62,7 @@ async fn rlm_send_executes_repl_steps_and_persists_final_answer() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Rlm, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, 
model: "gpt-5.4".to_string(), @@ -188,6 +189,7 @@ async fn rlm_subquery_variable_can_store_final_answer() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Rlm, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -252,6 +254,7 @@ async fn rlm_send_stream_suppresses_internal_control_text() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Rlm, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -353,6 +356,7 @@ globalThis.answer = String(\n\ name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Rlm, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -422,6 +426,7 @@ async fn rlm_can_finish_by_setting_final_in_repl() { name: Some("Demo".to_string()), harness: crate::AgentHarnessKind::Rlm, typescript: None, + library_tools: Vec::new(), sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), diff --git a/docs/tools.md b/docs/tools.md index 00114a1..62a8f5f 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -79,6 +79,21 @@ For example, an IRC tool might take `passwordSecretId` as an initialization parameter. The tool handler can resolve the secret at execution time through the exoharness API. +## Command Line Loading + +TypeScript agents can load library tools from manifest files when the agent is +created or updated: + +```bash +exo --harness typescript agent create "Tool Demo" \ + --module examples/typescript/basic-harness.ts \ + --model gpt-5.4 \ + --tool-manifest examples/typescript/tools/uppercase.manifest.json +``` + +`--tool-manifest` may be passed more than once. Relative `modulePath` values in +each manifest are resolved relative to that manifest file. 
+ ## Safety Considerations Different tool sources have different trust levels: @@ -103,10 +118,9 @@ Recommended defaults: The generic registry, built-in shell registration, library tool loading, and agent tool manifest loading are implemented in the TypeScript harness API. -The basic TypeScript harness currently opts into only the built-in shell tool. -Library and agent tools can be registered by harnesses using the manifest -helpers, but the basic harness does not load manifests automatically yet. +The basic TypeScript harness currently opts into the built-in shell tool and +loads library tool manifests stored on the agent config. There is an example library tool at `examples/typescript/tools/uppercase.ts`. -It exists to test and demonstrate the registry contract; no example harness -exposes it to a model by default. +It exists to test and demonstrate the registry contract, and can be enabled with +`examples/typescript/tools/uppercase.manifest.json`. diff --git a/examples/typescript/basic-harness.ts b/examples/typescript/basic-harness.ts index b1ff657..2eddde7 100644 --- a/examples/typescript/basic-harness.ts +++ b/examples/typescript/basic-harness.ts @@ -3,6 +3,7 @@ import { defineHarness, materializePromptMessages, registerBuiltInTools, + registerLibraryToolsFromManifest, turnMetadata, type TurnContext, } from "@exo/harness"; @@ -40,6 +41,9 @@ async function runBasicTurnLoop( const maxToolRoundTrips = context.agentConfig.maxToolRoundTrips; const tools = createToolRegistry(context); registerBuiltInTools(tools, context, ["shell"]); + await registerLibraryToolsFromManifest(tools, context, { + tools: context.agentConfig.libraryTools, + }); let latestEventId: string | null = null; for (let round = 0; ; round += 1) { diff --git a/examples/typescript/tools/irc.manifest.json b/examples/typescript/tools/irc.manifest.json new file mode 100644 index 0000000..8a17be5 --- /dev/null +++ b/examples/typescript/tools/irc.manifest.json @@ -0,0 +1,17 @@ +{ + "tools": [ + { + 
"modulePath": "./irc.ts", + "initialization": { + "server": "irc.libera.chat", + "port": 6697, + "nick": "exo-demo", + "username": "exo-demo", + "realname": "Exo Demo", + "tls": true, + "dryRun": false, + "passwordSecretId": null + } + } + ] +} diff --git a/examples/typescript/tools/irc.ts b/examples/typescript/tools/irc.ts index ba3f674..0799453 100644 --- a/examples/typescript/tools/irc.ts +++ b/examples/typescript/tools/irc.ts @@ -43,10 +43,12 @@ const ircTool = { properties: { ok: { type: "boolean" }, dryRun: { type: "boolean" }, + registered: { type: "boolean" }, + joined: { type: "boolean" }, server: { type: "string" }, channel: { type: "string" }, }, - required: ["ok", "dryRun", "server", "channel"], + required: ["ok", "dryRun", "registered", "joined", "server", "channel"], }, }, initializationParameters: { @@ -108,37 +110,153 @@ async function sendIrcMessage( const channel = stringArgument(args, "channel"); const text = stringArgument(args, "text"); const password = await resolvePassword(context, config.passwordSecretId); - const commands = ircCommands(config, channel, text, password); + let registered = false; + let joined = false; if (!config.dryRun) { await withIrcConnection(config, async (socket) => { - for (const command of commands) { - socket.write(`${command}\r\n`); - } + const session = await registerJoinAndSend( + socket, + config, + channel, + text, + password, + ); + registered = session.registered; + joined = session.joined; }); } return { ok: true, dryRun: config.dryRun, + registered, + joined, server: config.server, channel, }; } -function ircCommands( +interface IrcSessionResult { + registered: boolean; + joined: boolean; +} + +async function registerJoinAndSend( + socket: net.Socket, config: IrcConfig, channel: string, text: string, password: string | null, -): string[] { - return [ - ...(password ? 
[`PASS ${password}`] : []), - `NICK ${config.nick}`, - `USER ${config.username} 0 * :${config.realname}`, - `PRIVMSG ${channel} :${text}`, - "QUIT", - ]; +): Promise { + const reader = new IrcLineReader(socket); + if (password) { + writeIrcCommand(socket, `PASS ${password}`); + } + writeIrcCommand(socket, `NICK ${config.nick}`); + writeIrcCommand(socket, `USER ${config.username} 0 * :${config.realname}`); + + await reader.waitFor( + (line) => line.includes(` 001 ${config.nick} `), + "IRC registration welcome", + ); + writeIrcCommand(socket, `JOIN ${channel}`); + await reader.waitFor( + (line) => + line.includes(` JOIN ${channel}`) || + line.includes(` JOIN :${channel}`) || + line.includes(` 366 ${config.nick} ${channel} `), + `JOIN confirmation for ${channel}`, + ); + writeIrcCommand(socket, `PRIVMSG ${channel} :${text}`); + await sleep(500); + writeIrcCommand(socket, "QUIT"); + return { registered: true, joined: true }; +} + +function writeIrcCommand(socket: net.Socket, command: string): void { + socket.write(`${command}\r\n`); +} + +class IrcLineReader { + private buffer = ""; + private readonly lines: string[] = []; + private waiter: IrcLineWaiter | null = null; + + constructor(private readonly socket: net.Socket) { + socket.on("data", (chunk) => this.handleData(chunk.toString())); + } + + waitFor( + predicate: (line: string) => boolean, + description: string, + timeoutMs = 10_000, + ): Promise { + const existingIndex = this.lines.findIndex(predicate); + if (existingIndex >= 0) { + const [line] = this.lines.splice(existingIndex, 1); + if (line === undefined) { + throw new Error("IRC line reader invariant violated"); + } + return Promise.resolve(line); + } + if (this.waiter) { + return Promise.reject(new Error("IRC line reader already has a waiter")); + } + + return new Promise((resolve, reject) => { + const timeout = setTimeout(() => { + this.waiter = null; + reject(new Error(`Timed out waiting for ${description}`)); + }, timeoutMs); + this.waiter = { predicate, 
resolve, reject, timeout }; + }); + } + + private handleData(data: string): void { + this.buffer += data; + for (;;) { + const newlineIndex = this.buffer.indexOf("\n"); + if (newlineIndex === -1) { + return; + } + const line = this.buffer.slice(0, newlineIndex).replace(/\r$/, ""); + this.buffer = this.buffer.slice(newlineIndex + 1); + this.handleLine(line); + } + } + + private handleLine(line: string): void { + const pingToken = line.match(/^PING :(.+)$/)?.[1]; + if (pingToken) { + writeIrcCommand(this.socket, `PONG :${pingToken}`); + } + + if (!this.waiter) { + this.lines.push(line); + return; + } + if (!this.waiter.predicate(line)) { + this.lines.push(line); + return; + } + + const waiter = this.waiter; + this.waiter = null; + clearTimeout(waiter.timeout); + waiter.resolve(line); + } +} + +interface IrcLineWaiter { + predicate: (line: string) => boolean; + resolve: (line: string) => void; + reject: (error: Error) => void; + timeout: ReturnType; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); } async function resolvePassword( diff --git a/examples/typescript/tools/uppercase.manifest.json b/examples/typescript/tools/uppercase.manifest.json new file mode 100644 index 0000000..f673ec3 --- /dev/null +++ b/examples/typescript/tools/uppercase.manifest.json @@ -0,0 +1,10 @@ +{ + "tools": [ + { + "modulePath": "./uppercase.ts", + "initialization": { + "prefix": "UPPER: " + } + } + ] +} diff --git a/tools.md b/tools.md new file mode 100644 index 0000000..8d6fa02 --- /dev/null +++ b/tools.md @@ -0,0 +1,1079 @@ +# Tool Support Plan + +## Context + +`exo` separates the trusted exoharness substrate from executor-owned agent +semantics. Tool support should follow the same split: + +- The exoharness owns durable state, events, artifacts, bindings, secrets, and + sandbox execution. 
+- Executors and harness modules own which model-facing tools exist, how they + are exposed, how calls are authorized, and how calls are dispatched. + +The tool system should make it easy for harnesses to expose a small set of +model-callable functions without turning exoharness into a product-specific +integration registry. The exoharness can already store bindings and secrets. +Tools should use those generic substrate capabilities when they need +configuration or credentials, but tool semantics remain above the substrate. + +## Goals + +- Let TypeScript harnesses compose tools without hard-coding every tool in Rust. +- Keep the model-facing tool contract portable across model runtimes. +- Preserve `tool_requested` and `tool_result` events as the canonical durable + record of tool use. +- Use existing bindings and secrets for credentials instead of tool-specific + secret plumbing. +- Keep product-specific tool behavior out of the exoharness substrate. +- Support three tool sources: built-in, library, and agent. +- Make it possible for an agent to create a local tool as code without needing a + package distribution system. + +## Non-Goals + +- Do not make exoharness a registry of specific app semantics. +- Do not require all tool execution to cross the Rust `execute_tool` protocol. +- Do not choose one model provider's tool schema as the internal source of + truth. +- Do not design a standardized tool marketplace or package distribution system + yet. +- Do not add product-specific event variants, binding kinds, or storage records + for individual tools. + +## Current Shape + +The current TypeScript surface has a small `ToolDefinition`: + +```ts +interface ToolDefinition { + name: string; + description: string; + parameters: JsonValue; +} +``` + +`examples/typescript/basic-harness.ts` exposes only +`buildShellToolDefinitions(context.conversationConfig)`. When the model calls a +tool, the TypeScript runner sends an `execute_tool` runtime request to Rust. 
+Rust's `BasicToolRuntime` currently dispatches only `shell`, backed by the +conversation sandbox. + +That is a good host-backed boundary for built-in tools that need Rust-owned +runtime behavior. It should not be the only tool execution path. TypeScript can +already access events, artifacts, bindings, secrets, and sandbox processes +through `TurnContext`, so many tools can execute directly in TypeScript while +still using exoharness for durable and privileged operations. + +## Tool Sources + +### Built-In + +Built-in tools are maintained by the maintainers of `exo`. They are part of the +core release, reviewed with the project, documented with the harness, and +updated as `exo` evolves. + +Examples: + +- `shell` +- `run_workspace_command`, if we choose to ship it as a first-party tool +- future core exoharness inspection or artifact tools + +Built-ins can still be optional. A conversation or harness should explicitly +choose which built-ins are exposed to the model. + +### Library + +Library tools are not written by the agent itself, but they are also not part of +the core `exo` release. They may be written by the user, by a team, or by a +third party. + +There is no standardized distribution plan yet. For now, a library tool can be a +local TypeScript module imported by a harness. Later, library tools could be +distributed as npm packages, copied modules, git submodules, or another format. +The architecture should not depend on that choice. + +Examples: + +- A user-written IRC tool module. +- A team-maintained internal incident-management tool. +- An externally maintained GitHub or Linear tool package. + +### Agent + +Agent tools are created by the agent itself. The agent may write a TypeScript +module, a script, or another local artifact, then ask the harness to expose it as +a model-facing tool. + +Agent tools are the riskiest category because the author is the agent. They +should be clearly marked as `agent`, scoped narrowly, and subject to stricter +policy. 
For a first implementation, agent-created tools should be local to a +conversation or workspace and should not be promoted into shared library tools +without user review. + +## Core Design + +### Model Tool Definition + +Keep the model-facing definition small and provider-neutral: + +```ts +interface ToolDefinition { + name: string; + description: string; + parameters: JsonValue; + outputSchema?: JsonValue; +} +``` + +`outputSchema` should be optional. It is useful for tools that return structured +results, but model runtimes can ignore it when the provider has no native +output-schema concept. + +Auth requirements, source, policy, runtime choice, and provenance should not be +added to `ToolDefinition`. Those are executor concerns. + +### Harness Tool + +Add an executor-side representation around the model definition: + +```ts +type HarnessToolSource = "built_in" | "library" | "agent"; + +interface ToolExecutionContext { + readonly context: TurnContext; + readonly toolCallId?: string; +} + +interface ToolHandler { + execute( + args: JsonObject, + execution: ToolExecutionContext, + ): Promise<ToolResult>; +} + +interface ToolInstance { + definition: ToolDefinition; + source: HarnessToolSource; + handler: ToolHandler; +} +``` + +This is a TypeScript harness API, not an exoharness API. The executor can attach +policy, tracing, auth, and implementation details without changing the portable +model-facing contract. + +### Tool Module + +Library and agent tools should use a standardized module shape. Each tool module +should default export a `Tool`. The export name is standardized, so loaders do +not need to guess whether a file exports `createTool`, `ircSendMessageTool`, or +something else. This is the TypeScript equivalent of loading a `.so` file with a +known interface. + +Tool modules should separate initialization parameters from runtime parameters: + +- Initialization parameters configure the tool before it is exposed.
They are + not model-visible and can include server names, default channels, secret ids, + allowlists, and other setup values. +- Runtime parameters are the model-facing arguments in `definition.parameters`. + They are supplied by the model each time it calls the tool. + +```ts +interface ToolInitializationContext { + readonly context: TurnContext; + readonly source: HarnessToolSource; +} + +interface Tool { + definition: ToolDefinition; + initializationParameters: JsonValue; + initialize( + args: JsonObject, + initialization: ToolInitializationContext, + ): Promise<ToolHandler> | ToolHandler; +} +``` + +The registry or loader combines the module, source, and initialized handler into +a `ToolInstance`: + +```ts +async function initializeTool( + tool: Tool, + source: HarnessToolSource, + initializationArgs: JsonObject, + context: TurnContext, +): Promise<ToolInstance> { + return { + definition: tool.definition, + source, + handler: await tool.initialize(initializationArgs, { + context, + source, + }), + }; +} +``` + +This makes the module contract stable while leaving each tool's private +implementation types, such as `IrcConfig`, internal to that module. The harness +can load the module with `await import(path)` and read `module.default`. For +static imports, the equivalent is: + +```ts +import * as module from "./foo"; + +const tool = module.default; +``` + +### Tool Registry + +Add a `HarnessToolRegistry` in `typescript/harness/index.ts`: + +```ts +const tools = createToolRegistry(context); + +tools.useBuiltIns(["shell"]); +tools.register(await loadLibraryTool(context, "irc", ircInitialization)); +tools.register(await loadAgentTool(context, "irc_send_message")); + +const request = { + model, + messages, + tools: tools.definitions(), +}; + +const events = await tools.executePending(toolCalls); +``` + +The registry should: + +- Map tool names to `ToolInstance` handlers. +- Reject duplicate tool names at registration time. +- Expose `definitions()` for model calls.
+- Execute pending tool calls with streaming `tool_call` and `tool_result` + updates when enabled. +- Return durable `tool_result` events for the caller to append to the turn. +- Preserve each tool's source so policy and tracing can distinguish built-in, + library, and agent tools. +- Support registering initialized `Tool` default exports for + library and agent tools. + +The existing `context.executePendingTools` can remain as the host-backed default +for compatibility with simple harnesses. The registry should be the preferred +path for TypeScript harnesses that compose multiple tool sources. + +## Execution Paths + +### Host-Backed Execution + +Some tools should continue to delegate to Rust or another host runtime. `shell` +is the main example today. The TypeScript registry can expose `shell` as a +built-in tool while its handler delegates to: + +```ts +context.executeTool({ + functionName: "shell", + arguments: args, +}); +``` + +This lets Rust continue to own sandbox lifecycle and shell execution while the +TypeScript harness gets a uniform registry API. + +### TypeScript Execution + +Library and agent tools can often run directly in the TypeScript harness runner. +They can call external APIs, use Node libraries, access generic exoharness +bindings and secrets, write artifacts, and append custom events through the +existing `TurnContext`. + +This path is useful for tools where the trusted substrate does not need to know +the protocol semantics. + +### Sandboxed Process Execution + +Some tools need to run code in a sandbox. A built-in tool such as +`run_workspace_command` can use: + +```ts +const process = await context.startSandboxProcess({ + command: [shellProgram, "-lc", command], +}); +``` + +This is useful for running scripts or local programs, but it should be treated +as a powerful built-in capability, not as its own tool source. If we expose it, +it should have explicit policy and should be enabled intentionally. 
+ +Before relying on sandboxed execution for untrusted agent-authored code, we +should verify the sandbox security model. If we need strong in-process +JavaScript isolation, a smaller runtime such as QuickJS may be a better fit than +unrestricted Node execution. + +## Configuration and Credentials + +Tools should use existing generic substrate objects: + +- Non-secret configuration can live in harness code, agent config, + conversation config, artifacts, or future generic installation records. +- Credentials should live in `Secret`. +- References between configuration and credentials should use secret ids or + existing binding ids. +- Tool definitions should not expose raw credential material. +- Tool result events should not contain raw credential material. + +If persisted tool installation state becomes necessary, add a generic record +that does not encode product-specific semantics: + +```ts +interface ToolInstallation { + id: string; + toolId: string; + source: "library" | "agent"; + version?: string; + scope: "exoharness" | "agent" | "conversation"; + initialization: JsonObject; + bindingIds: string[]; + secretIds: string[]; + enabled?: boolean; +} +``` + +Do this only when the executor needs persisted tool configuration. The first +implementation can work with explicit imports and local initialization arguments +in the harness. + +## Policy + +Policy belongs to the executor or harness module. Evaluate it in two places: + +- Exposure time: decide whether a tool should be included in + `tools.definitions()` for the current turn. +- Invocation time: decide whether the exact call can run with the supplied + arguments, credentials, bindings, mounts, network access, and user/session + context. + +The first implementation can keep policy simple and explicit: + +- `shell` is exposed only when `conversationConfig.shellProgram` is set. +- Networked tools require explicit networking enablement. 
+- Tools with external side effects should have a confirmation hook before + execution. +- Agent tools should default to the narrowest useful scope. +- Agent tools should not silently persist beyond the conversation or workspace + where they were created. + +The CLI/TUI should render confirmation prompts, but the executor should own the +decision and the durable record of the decision. + +## Events and Observability + +Keep `tool_requested` and `tool_result` as the canonical history. Model runtime +helpers already append `tool_requested` from model outputs, and registry +execution should return `tool_result` events. + +Add optional custom events only when they provide real value: + +- `tool_policy_decision`: exposure or invocation allowed/denied. +- `tool_invocation_started`: tool name, source, optional library id/version. +- `tool_invocation_completed`: duration, status, redacted result summary. +- `tool_auth_refreshed`: secret id or binding id, without credential material. + +Large logs should be artifacts. Events should contain summaries and references, +not unbounded output or secrets. + +Tracing should also preserve the tool source. That makes it possible to compare +built-in, library, and agent tool behavior in Braintrust or other tracing +systems without changing the durable event contract. + +## Incremental Implementation Plan + +The implementation should move in small, testable steps. The first milestone is +shell parity: the TypeScript basic harness should behave exactly as it does +today, but through the registry. Only after that should we add library and agent +tool loading. + +### Step 1: Add Portable Types Only + +Add the core TypeScript types in `typescript/harness/index.ts`: + +- `outputSchema?: JsonValue` on `ToolDefinition`. +- `HarnessToolSource = "built_in" | "library" | "agent"`. +- `ToolExecutionContext`. +- `ToolHandler`. +- `ToolInstance`. +- `ToolInitializationContext`. +- `Tool`. 
+ +For Rust, add `output_schema: Option<JsonValue>` to the Rust `ToolDefinition` only +if the Rust model-runtime path needs to deserialize or forward tool definitions +with output schemas. This can be done later if TypeScript-only work does not +touch Rust serialization. + +Test checkpoint: + +- `pnpm typecheck` +- `cargo test -p executor` if the Rust `ToolDefinition` changes + +Expected behavior change: none. + +### Step 2: Add Registry Without Switching Harnesses + +Add `HarnessToolRegistry` and `createToolRegistry(context)`. + +The registry should support: + +- `register(tool: ToolInstance)`. +- Duplicate-name rejection. +- `definitions()`. +- `get(name)`. +- `executePending(toolCalls)`, including stream events and `tool_result` event + construction. + +At this point, no harness needs to use it yet. + +Test checkpoint: + +- Unit tests for duplicate registration. +- Unit tests for `definitions()`. +- Unit tests for `executePending(...)` using a fake in-memory `ToolInstance`. +- `pnpm typecheck` + +Expected behavior change: none. + +### Step 3: Move Shell Definition Behind A Built-In Tool + +Implement a built-in shell `ToolInstance` that delegates execution to the +existing host path: + +```ts +context.executeTool({ + functionName: "shell", + arguments: args, +}); +``` + +Then reimplement `buildShellToolDefinitions(config)` through the built-in shell +helper. Existing callers should still receive the same model-facing shell +definition. + +Test checkpoint: + +- Existing tests still pass. +- A focused test verifies `buildShellToolDefinitions(...)` returns the same + shape as before. +- A focused test verifies the shell `ToolInstance` delegates to + `context.executeTool`. + +Expected behavior change: none.
+ +### Step 4: Let Tracing Use A Custom Tool Executor + +Update `ResponsesRuntime.traceToolCall(...)` to accept an optional execution +callback: + +```ts +execute = (toolCall: PendingToolCall) => + context.executePendingTools([toolCall]); +``` + +The default preserves existing behavior. Registry-aware harnesses can pass: + +```ts +(toolCall) => tools.executePending([toolCall]); +``` + +Test checkpoint: + +- Unit test or typecheck proving existing call sites compile unchanged. +- Unit test proving a supplied callback is used. +- `pnpm typecheck` + +Expected behavior change: none for existing harnesses. + +### Step 5: Switch The Basic TypeScript Harness To Shell Through Registry + +Update `examples/typescript/basic-harness.ts` to: + +- Create a registry once per turn loop. +- Register built-in `shell`. +- Pass `tools.definitions()` to the model. +- Execute tool calls through the registry callback passed to + `traceToolCall(...)`. + +This step should expose only shell, so it should be behaviorally equivalent to +the current basic TypeScript harness. + +Test checkpoint: + +- `pnpm typecheck` +- Existing TypeScript harness tests or e2e script. +- Manual smoke test: ask the basic TypeScript harness to run a simple shell + command and verify `tool_requested` / `tool_result` events still appear. + +Expected behavior change: none except internal dispatch path. + +### Step 6: Prove Direct TypeScript Library Tools + +Add one small library tool that does not require Rust. Prefer a harmless local +tool over a networked integration for the first proof, for example: + +- `echo_json` +- `now_fixed_for_test` +- `uppercase` + +The point is to prove that a `Tool` default export can be initialized and +registered, and that its handler can produce a `tool_result` without +`context.executeTool`. + +Test checkpoint: + +- Unit test imports the module, initializes it, registers it, and executes it. 
+- `pnpm typecheck` + +Expected behavior change: none unless the example harness opts into this tool. + +### Step 7: Add A Local Agent Tool Loading Convention + +Add the smallest local convention for agent tools, such as an artifact or config +record containing: + +```json +{ + "tools": [ + { + "modulePath": ".exo/agent-tools/irc.ts", + "initialization": {} + } + ] +} +``` + +The loader should: + +- Import `module.default`. +- Validate it satisfies the `Tool` shape. +- Validate `initialization` against `initializationParameters`. +- Call `initializeTool(...)`. +- Register the resulting `ToolInstance` with source `"agent"`. + +This can start as a helper used by the example TypeScript harness rather than a +new exoharness storage feature. + +Test checkpoint: + +- Unit test with a generated local agent tool module. +- Unit test for a missing default export. +- Unit test for invalid initialization parameters. +- `pnpm typecheck` + +Expected behavior change: only conversations/harnesses that opt into agent tool +loading can expose agent tools. + +### Step 8: Add An Example IRC Tool + +After the local agent tool loading path works, add a concrete IRC tool under an +examples directory, for example `examples/typescript/tools/irc.ts`. + +This should be an example of the standardized `Tool` default export: + +- `definition` exposes the runtime model-facing `irc_send_message` parameters. +- `initializationParameters` exposes setup values such as server, port, nick, + TLS, allowed channels, and optional password secret id. +- `initialize(...)` validates initialization arguments and returns a + `ToolHandler`. +- The handler uses the generic secret APIs for credentials and regular + TypeScript/Node networking for IRC. + +This should be committed separately from the core registry and loading changes. +That keeps the review split clean: first prove the tool API, then add a real +example tool that exercises it. 
+ +Test checkpoint: + +- Unit test imports the IRC tool, validates initialization, initializes it, and + verifies the model-facing definition. +- A network-free handler test should mock the IRC socket or connection layer. +- `pnpm typecheck` + +Expected behavior change: none unless an example harness opts into the IRC tool. + +### Step 9: Add Optional Built-In Code Execution + +Only after shell parity and direct TypeScript tools work, decide whether to add +`run_workspace_command` as a built-in. If added, treat it as a powerful built-in +capability with explicit enablement and tests. + +Test checkpoint: + +- Unit tests for argument validation and structured output. +- Sandbox smoke test. +- Manual review of sandbox security assumptions. + +Expected behavior change: only when the built-in is explicitly enabled. + +### Step 10: Defer Persistent Installation Storage + +Do not add `ToolInstallation` storage until a real library or agent tool needs +durable configuration that cannot reasonably live in harness code, agent config, +conversation config, or artifacts. + +Test checkpoint: + +- None yet. This should remain a later design decision. + +Expected behavior change: none. + +## Suggested First Patch + +The first patch should stop at Step 2: + +- Add the TypeScript types. +- Add `HarnessToolRegistry`. +- Add tests for registration, duplicate names, definitions, and execution using + fake in-memory tools. +- Do not change `examples/typescript/basic-harness.ts` yet. +- Do not change Rust unless TypeScript changes force a Rust schema update. + +That patch validates the core API without changing runtime behavior. The second +patch can add shell as a built-in registry tool while preserving the old +`buildShellToolDefinitions(...)` behavior. The third patch can switch the basic +TypeScript harness to registry-backed shell execution. + +## Open Questions + +- What is the confirmation API between executor and CLI/TUI? 
+- Should `run_workspace_command` be a built-in tool, or should the first built-in + code execution tool have a narrower interface? +- What local entrypoint should agent-created tools use so the agent does not + have to modify the main harness module directly? +- Should library tools be loaded only by explicit imports at first, or should + there be a small manifest format? +- What is the smallest generic installation record needed before adding storage? + +## Recommendation + +Build the TypeScript registry first. Treat tools as built-in, library, or agent +tools. Keep `shell` on the existing Rust execution path, execute library and +agent tools directly in TypeScript where practical, and keep exoharness focused +on durable substrate responsibilities: events, bindings, secrets, artifacts, and +sandbox execution. + +## Example: Agent-Created IRC Tool + +This example walks through how an agent could create IRC support as its own +tool. IRC is useful because it needs network access, configuration, and optional +credentials, but it does not require exoharness to learn anything IRC-specific. 
+ +Assume the agent wants to expose this model-facing tool: + +```ts +{ + name: "irc_send_message", + description: "Send a message to an IRC channel.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + channel: { + type: "string", + description: "IRC channel name, for example #exo.", + }, + text: { + type: "string", + description: "Message text to send.", + }, + }, + required: ["channel", "text"], + }, + outputSchema: { + type: "object", + additionalProperties: false, + properties: { + ok: { type: "boolean" }, + server: { type: "string" }, + channel: { type: "string" }, + }, + required: ["ok", "server", "channel"], + }, +} +``` + +### What the Agent Creates + +The agent writes a local tool module, for example +`.exo/agent-tools/irc.ts`: + +```ts +import net from "node:net"; +import tls from "node:tls"; + +import type { Tool, JsonObject, ToolResult, TurnContext } from "@exo/harness"; + +interface IrcConfig { + server: string; + port: number; + nick: string; + username: string; + realname: string; + tls: boolean; + passwordSecretId?: string | null; +} + +const ircTool = { + definition: { + name: "irc_send_message", + description: "Send a message to an IRC channel.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + channel: { type: "string" }, + text: { type: "string" }, + }, + required: ["channel", "text"], + }, + outputSchema: { + type: "object", + additionalProperties: false, + properties: { + ok: { type: "boolean" }, + server: { type: "string" }, + channel: { type: "string" }, + }, + required: ["ok", "server", "channel"], + }, + }, + initializationParameters: { + type: "object", + additionalProperties: false, + properties: { + server: { type: "string" }, + port: { type: "number" }, + nick: { type: "string" }, + username: { type: "string" }, + realname: { type: "string" }, + tls: { type: "boolean" }, + passwordSecretId: { type: ["string", "null"] }, + }, + required: ["server", "port", "nick", 
"username", "realname", "tls"], + }, + initialize(args) { + const config = parseIrcConfig(args); + return { + async execute(args, execution): Promise { + return sendIrcMessage(execution.context, config, args); + }, + }; + }, +} satisfies Tool; + +export default ircTool; + +function parseIrcConfig(args: JsonObject): IrcConfig { + return { + server: stringArgument(args, "server"), + port: numberArgument(args, "port"), + nick: stringArgument(args, "nick"), + username: stringArgument(args, "username"), + realname: stringArgument(args, "realname"), + tls: booleanArgument(args, "tls"), + passwordSecretId: optionalStringArgument(args, "passwordSecretId"), + }; +} + +async function sendIrcMessage( + context: TurnContext, + config: IrcConfig, + args: JsonObject, +): Promise { + const channel = stringArgument(args, "channel"); + const text = stringArgument(args, "text"); + const password = await resolvePassword(context, config.passwordSecretId); + + await withIrcConnection(config, async (socket) => { + if (password) { + socket.write(`PASS ${password}\r\n`); + } + socket.write(`NICK ${config.nick}\r\n`); + socket.write(`USER ${config.username} 0 * :${config.realname}\r\n`); + socket.write(`PRIVMSG ${channel} :${text}\r\n`); + socket.write("QUIT\r\n"); + }); + + return { + ok: true, + server: config.server, + channel, + }; +} + +async function resolvePassword( + context: TurnContext, + secretId: string | null | undefined, +): Promise { + if (!secretId) { + return null; + } + const secret = + await context.exoharness.current.conversation.getSecret(secretId); + if (!secret) { + throw new Error(`IRC password secret does not exist: ${secretId}`); + } + if (secret.type !== "key") { + throw new Error("IRC password secret must be a key secret"); + } + return secret.value; +} + +async function withIrcConnection( + config: IrcConfig, + run: (socket: net.Socket) => Promise | void, +): Promise { + await new Promise((resolve, reject) => { + const socket = config.tls + ? 
tls.connect(config.port, config.server) + : net.connect(config.port, config.server); + socket.setEncoding("utf8"); + socket.setTimeout(10_000); + socket.once("connect", async () => { + try { + await run(socket); + socket.end(resolve); + } catch (error) { + socket.destroy(); + reject(error); + } + }); + socket.once("error", reject); + socket.once("timeout", () => { + socket.destroy(new Error("IRC connection timed out")); + }); + }); +} + +function stringArgument(args: JsonObject, name: string): string { + const value = args[name]; + if (typeof value !== "string" || value.length === 0) { + throw new Error(`IRC tool argument ${name} must be a non-empty string`); + } + return value; +} + +function optionalStringArgument(args: JsonObject, name: string): string | null { + const value = args[name]; + if (value === undefined || value === null) { + return null; + } + if (typeof value !== "string" || value.length === 0) { + throw new Error(`IRC tool initialization ${name} must be a string`); + } + return value; +} + +function numberArgument(args: JsonObject, name: string): number { + const value = args[name]; + if (typeof value !== "number") { + throw new Error(`IRC tool initialization ${name} must be a number`); + } + return value; +} + +function booleanArgument(args: JsonObject, name: string): boolean { + const value = args[name]; + if (typeof value !== "boolean") { + throw new Error(`IRC tool initialization ${name} must be a boolean`); + } + return value; +} +``` + +This module is ordinary TypeScript harness code. It does not require a Rust tool +implementation because it can run directly inside the TypeScript harness runner. +It uses exoharness only for secret lookup. Its default export is the stable +loader contract. `IrcConfig` can remain internal because the harness passes +untyped JSON initialization parameters and the module validates them. 
+ +### What the Harness Needs + +The harness needs a local entrypoint for agent-created tools so the agent does +not have to edit the main turn loop every time. A simple first version could be +an explicit loader in `examples/typescript/basic-harness.ts`: + +```ts +interface AgentToolManifest { + tools: Array<{ + modulePath: string; + initialization: JsonObject; + }>; +} + +async function registerAgentTools( + context: TurnContext, + tools: HarnessToolRegistry, +): Promise<void> { + const manifest = + await context.exoharness.current.conversation.readArtifactJson<AgentToolManifest>( + { + artifactId: "agent-tools", + }, + ); + for (const entry of manifest?.tools ?? []) { + const module = (await import(entry.modulePath)) as { + default: Tool; + }; + tools.register( + await initializeTool( + module.default, + "agent", + entry.initialization, + context, + ), + ); + } +} +``` + +`initializeTool` should validate `entry.initialization` against the tool's +`initializationParameters` before calling `initialize(...)`. + +The turn loop then builds the registry and loads agent tools before calling the +model: + +```ts +const tools = createToolRegistry(context).useBuiltIns(["shell"]); +await registerAgentTools(context, tools); + +const request: NativeResponsesRequest = { + model, + messages, + tools: tools.definitions(), + maxOutputTokens: context.agentConfig.maxOutputTokens, + metadata: turnMetadata(context), +}; +``` + +The agent would also need to write the manifest artifact: + +```json +{ + "tools": [ + { + "modulePath": ".exo/agent-tools/irc.ts", + "initialization": { + "server": "irc.libera.chat", + "port": 6697, + "nick": "exo-agent", + "username": "exo", + "realname": "Exo Agent", + "tls": true, + "passwordSecretId": "irc-password" + } + } + ] +} +``` + +This is intentionally a minimal local convention, not a distribution system. A +more polished version could validate the manifest, restrict allowed paths, cache +loaded modules, and require user approval before exposing new agent tools.
+ +### Tool Execution Wiring + +The harness passes `tools.definitions()` to the model request and executes +returned calls through `tools.executePending(...)`: + +```ts +const toolResultEvents = await tools.executePending([toolCall]); +await turn.addEvents(toolResultEvents); +``` + +If the current runtime helper still hardcodes +`context.executePendingTools(...)`, update `ResponsesRuntime.traceToolCall` to +accept an optional executor callback: + +```ts +async traceToolCall( + turnParent: TraceParent, + context: TurnContext, + toolCall: PendingToolCall, + roundIndex: number, + execute = (toolCall: PendingToolCall) => + context.executePendingTools([toolCall]), +): Promise { + return tracedUnderParent( + turnParent, + async (span) => { + const events = await execute(toolCall); + span.log({ output: toolResultTraceOutput(events) }); + return events; + }, + // existing trace args + ); +} +``` + +The harness can then call: + +```ts +await runtime.traceToolCall(turnParent, context, toolCall, round, (toolCall) => + tools.executePending([toolCall]), +); +``` + +### Required Configuration + +The conversation or agent must have networking enabled because IRC is an +external network call: + +```bash +exo agent create --model gpt-5.4 --enable-networking "IRC Agent" +``` + +If the IRC server requires a password or NickServ token, store it as a normal +secret: + +```bash +exo secret set irc-password --env IRC_PASSWORD +``` + +The exact CLI command may differ as the config surface evolves, but the storage +model should remain generic: the IRC tool references a secret id, and the +exoharness stores only the secret material. The model sees the tool schema and +arguments, not the password. + +### What Does Not Change + +This example should not require: + +- A new exoharness binding type named `irc`. +- A Rust `ToolRuntime` implementation for IRC. +- IRC-specific event variants. +- Raw IRC credentials in model-visible prompts, tool definitions, or events. 
+ +The durable event history remains the same: + +1. The model emits `irc_send_message`. +2. The executor appends `tool_requested`. +3. The registry authorizes and executes the TypeScript handler. +4. The registry returns a `tool_result` event. +5. The next model round sees the result through normal event materialization. + +### Hardening Before Sharing + +For a local experiment, the direct module above is enough. Before treating IRC +as a reusable library tool or allowing broad agent-created tools, add: + +- A policy check that only allows configured servers and channels. +- A confirmation requirement for sending messages to public channels. +- Rate limits and message length validation. +- Redacted observability events for connection failures. +- Manifest validation and path restrictions for agent tool modules. +- Tests for argument validation, duplicate tool registration, missing secrets, + disabled networking, and rejected manifests. + +After review, a user could promote the IRC implementation from an `agent` tool +to a `library` tool by moving it into a user-maintained module and importing it +explicitly from the harness. The exoharness substrate still only needs generic +bindings, secrets, artifacts, events, and sandbox/network policy. 
diff --git a/typescript/harness/index.test.ts b/typescript/harness/index.test.ts index 8b0092b..841514c 100644 --- a/typescript/harness/index.test.ts +++ b/typescript/harness/index.test.ts @@ -291,6 +291,8 @@ describe("library tool modules", () => { toolResultEvent("call_1", { ok: true, dryRun: true, + registered: false, + joined: false, server: "irc.example.test", channel: "#exo", }), @@ -468,6 +470,7 @@ function fakeTurnContext( instructions: [], harness: "typescript", typescript: null, + libraryTools: [], sandboxImage: null, enableNetworking: false, model: "test-model", diff --git a/typescript/harness/index.ts b/typescript/harness/index.ts index 2770a60..b0f85be 100644 --- a/typescript/harness/index.ts +++ b/typescript/harness/index.ts @@ -27,6 +27,10 @@ export interface AgentConfig { typescript?: { modulePath: string; } | null; + libraryTools: Array<{ + modulePath: string; + initialization: JsonObject; + }>; sandboxImage?: string | null; enableNetworking: boolean; model: string; diff --git a/typescript/harness/runner.ts b/typescript/harness/runner.ts index 41a871d..4ab626b 100644 --- a/typescript/harness/runner.ts +++ b/typescript/harness/runner.ts @@ -47,6 +47,7 @@ interface RawAgentConfig { typescript?: { module_path: string; } | null; + library_tools?: RawToolManifestEntry[]; sandbox_image?: string | null; enable_networking: boolean; model: string; @@ -55,6 +56,11 @@ interface RawAgentConfig { braintrust?: unknown; } +interface RawToolManifestEntry { + module_path: string; + initialization: JsonObject; +} + interface RawConversationConfig { enable_networking: boolean; shell_program?: string | null; @@ -777,6 +783,7 @@ function toAgentConfig(raw: RawAgentConfig): AgentConfig { modulePath: raw.typescript.module_path, } : null, + libraryTools: (raw.library_tools ?? []).map(toToolManifestEntry), sandboxImage: raw.sandbox_image ?? 
null, enableNetworking: raw.enable_networking, model: raw.model, @@ -786,6 +793,16 @@ function toAgentConfig(raw: RawAgentConfig): AgentConfig { }; } +function toToolManifestEntry(raw: RawToolManifestEntry): { + modulePath: string; + initialization: JsonObject; +} { + return { + modulePath: raw.module_path, + initialization: raw.initialization, + }; +} + function toConversationConfig(raw: RawConversationConfig): ConversationConfig { return { enableNetworking: raw.enable_networking, From dfd2b198218e60ec63f7496f309d40f55ebc933f Mon Sep 17 00:00:00 2001 From: 61cygni <> Date: Fri, 22 May 2026 23:28:35 -0700 Subject: [PATCH 5/8] Enable agent-created tools Let TypeScript agents install, validate, and reload their own tools during a turn so newly created capabilities can be used without manual registration. Co-authored-by: Cursor --- crates/cli/src/main.rs | 31 ++- crates/executor/src/basic_tests.rs | 1 + crates/executor/src/executor_types.rs | 6 + crates/executor/src/harness_basic_tests.rs | 8 + crates/executor/src/harness_facade.rs | 1 + crates/executor/src/harness_types.rs | 1 + crates/executor/src/rlm_tests.rs | 5 + docs/tools.md | 59 ++++- examples/typescript/basic-harness.ts | 44 +++- typescript/harness/built-in-tools.ts | 210 +++++++++++++++++- typescript/harness/index.test.ts | 243 ++++++++++++++++++++- typescript/harness/index.ts | 47 +++- typescript/harness/runner.ts | 2 + typescript/harness/tool-manifest.ts | 126 ++++++++++- typescript/harness/tools.ts | 85 ++++++- 15 files changed, 844 insertions(+), 25 deletions(-) diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index e649817..3f5d33a 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -118,6 +118,8 @@ enum AgentCommands { #[arg(long = "tool-manifest")] tool_manifests: Vec, #[arg(long)] + disable_agent_tool_creation: bool, + #[arg(long)] sandbox_image: Option, #[arg(long, value_enum)] networking: Option, @@ -147,6 +149,10 @@ enum AgentCommands { #[arg(long)] 
clear_tool_manifests: bool, #[arg(long)] + enable_agent_tool_creation: bool, + #[arg(long)] + disable_agent_tool_creation: bool, + #[arg(long)] sandbox_image: Option, #[arg(long)] clear_sandbox_image: bool, @@ -398,6 +404,7 @@ async fn main() -> Result<(), Box> { slug, module, tool_manifests, + disable_agent_tool_creation, sandbox_image, networking, model, @@ -427,6 +434,7 @@ async fn main() -> Result<(), Box> { harness: agent_harness_kind, typescript, library_tools, + enable_agent_tool_creation: !disable_agent_tool_creation, sandbox_image, enable_networking: matches!(networking, Some(NetworkingMode::Enabled)), model, @@ -452,6 +460,8 @@ async fn main() -> Result<(), Box> { clear_module, tool_manifests, clear_tool_manifests, + enable_agent_tool_creation, + disable_agent_tool_creation, sandbox_image, clear_sandbox_image, networking, @@ -473,6 +483,12 @@ async fn main() -> Result<(), Box> { "provide either --clear-tool-manifests or --tool-manifest, not both".into(), ); } + if enable_agent_tool_creation && disable_agent_tool_creation { + return Err( + "provide either --enable-agent-tool-creation or --disable-agent-tool-creation, not both" + .into(), + ); + } if clear_sandbox_image && sandbox_image.is_some() { return Err( "provide either --clear-sandbox-image or --sandbox-image, not both".into(), @@ -532,6 +548,15 @@ async fn main() -> Result<(), Box> { changed = true; } } + if enable_agent_tool_creation { + if !config.enable_agent_tool_creation { + config.enable_agent_tool_creation = true; + changed = true; + } + } else if disable_agent_tool_creation && config.enable_agent_tool_creation { + config.enable_agent_tool_creation = false; + changed = true; + } if clear_sandbox_image { config.sandbox_image = None; changed = true; @@ -594,7 +619,7 @@ async fn main() -> Result<(), Box> { )?; if updated_braintrust.is_none() && !changed { return Err( - "no changes provided; pass --set-harness, --module, --tool-manifest, --sandbox-image, --networking, model flags, 
--clear-braintrust, or Braintrust project flags" + "no changes provided; pass --set-harness, --module, --tool-manifest, --enable-agent-tool-creation, --disable-agent-tool-creation, --sandbox-image, --networking, model flags, --clear-braintrust, or Braintrust project flags" .into(), ); } @@ -633,6 +658,10 @@ async fn main() -> Result<(), Box> { for tool in &config.library_tools { println!(" - {}", tool.module_path); } + println!( + "enable_agent_tool_creation: {}", + config.enable_agent_tool_creation + ); println!( "sandbox_image: {}", config.sandbox_image.as_deref().unwrap_or("default") diff --git a/crates/executor/src/basic_tests.rs b/crates/executor/src/basic_tests.rs index 494ca4d..3d846f8 100644 --- a/crates/executor/src/basic_tests.rs +++ b/crates/executor/src/basic_tests.rs @@ -989,6 +989,7 @@ fn default_agent_config() -> AgentConfig { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "test-model".to_string(), diff --git a/crates/executor/src/executor_types.rs b/crates/executor/src/executor_types.rs index 8976f9e..c8c63aa 100644 --- a/crates/executor/src/executor_types.rs +++ b/crates/executor/src/executor_types.rs @@ -24,6 +24,8 @@ pub struct AgentConfig { pub typescript: Option, #[serde(default)] pub library_tools: Vec, + #[serde(default = "default_enable_agent_tool_creation")] + pub enable_agent_tool_creation: bool, #[serde(default)] pub sandbox_image: Option, #[serde(default)] @@ -55,6 +57,10 @@ pub struct ToolManifestEntry { pub initialization: Value, } +pub fn default_enable_agent_tool_creation() -> bool { + true +} + #[derive(Debug, Clone, Serialize, serde::Deserialize)] pub struct ConversationConfig { pub enable_networking: bool, diff --git a/crates/executor/src/harness_basic_tests.rs b/crates/executor/src/harness_basic_tests.rs index 81e7f6a..7668812 100644 --- a/crates/executor/src/harness_basic_tests.rs +++ 
b/crates/executor/src/harness_basic_tests.rs @@ -42,6 +42,7 @@ async fn creates_agents_and_conversations_with_persisted_config() { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -113,6 +114,7 @@ async fn send_persists_messages_through_harness() { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -168,6 +170,7 @@ async fn close_session_appends_session_ended_event() { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -252,6 +255,7 @@ async fn updating_agent_config_refreshes_executor_cache() { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -333,6 +337,7 @@ async fn send_executes_shell_tool_when_enabled() { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: true, model: "gpt-5.4".to_string(), @@ -422,6 +427,7 @@ async fn harness_exposes_raw_exoharness_handles() { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -502,6 +508,7 @@ async fn updating_mounts_recreates_shell_sandbox() { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -634,6 +641,7 @@ async fn 
conversation_model_override_changes_effective_model() { harness: crate::AgentHarnessKind::Basic, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), diff --git a/crates/executor/src/harness_facade.rs b/crates/executor/src/harness_facade.rs index 0973bdb..8fd0b0e 100644 --- a/crates/executor/src/harness_facade.rs +++ b/crates/executor/src/harness_facade.rs @@ -125,6 +125,7 @@ where harness: request.harness, typescript: request.typescript, library_tools: request.library_tools, + enable_agent_tool_creation: request.enable_agent_tool_creation, sandbox_image: request.sandbox_image, enable_networking: request.enable_networking, model: request.model, diff --git a/crates/executor/src/harness_types.rs b/crates/executor/src/harness_types.rs index 259cbb4..ea1e701 100644 --- a/crates/executor/src/harness_types.rs +++ b/crates/executor/src/harness_types.rs @@ -63,6 +63,7 @@ pub struct CreateAgentRequest { pub harness: AgentHarnessKind, pub typescript: Option, pub library_tools: Vec, + pub enable_agent_tool_creation: bool, pub sandbox_image: Option, pub enable_networking: bool, pub model: String, diff --git a/crates/executor/src/rlm_tests.rs b/crates/executor/src/rlm_tests.rs index ab22038..df4bee4 100644 --- a/crates/executor/src/rlm_tests.rs +++ b/crates/executor/src/rlm_tests.rs @@ -63,6 +63,7 @@ async fn rlm_send_executes_repl_steps_and_persists_final_answer() { harness: crate::AgentHarnessKind::Rlm, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -190,6 +191,7 @@ async fn rlm_subquery_variable_can_store_final_answer() { harness: crate::AgentHarnessKind::Rlm, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -255,6 +257,7 @@ async fn 
rlm_send_stream_suppresses_internal_control_text() { harness: crate::AgentHarnessKind::Rlm, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -357,6 +360,7 @@ globalThis.answer = String(\n\ harness: crate::AgentHarnessKind::Rlm, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), @@ -427,6 +431,7 @@ async fn rlm_can_finish_by_setting_final_in_repl() { harness: crate::AgentHarnessKind::Rlm, typescript: None, library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model: "gpt-5.4".to_string(), diff --git a/docs/tools.md b/docs/tools.md index 62a8f5f..25fcc62 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -14,11 +14,21 @@ execute project-specific code. Built-in tools are maintained by the `exo` project and shipped with the harness runtime. -Currently the only built-in tool is `shell`. The shell tool runs commands in the -conversation sandbox, using the conversation’s configured `shellProgram`. The -TypeScript harness exposes the model-facing tool, but execution is delegated to -the Rust host runtime so sandbox lifecycle and process execution remain -host-managed. +The basic TypeScript harness exposes `shell` and, when agent tool creation is +enabled, `install_agent_tool`. The shell tool runs commands in the conversation +sandbox, using the conversation's configured `shellProgram`. Execution is +delegated to the Rust host runtime so sandbox lifecycle and process execution +remain host-managed. + +`install_agent_tool` writes an agent-created TypeScript tool module into +`.exo/agent-tools/`, validates it, and updates `.exo/agent-tools/manifest.json`. +The basic harness refreshes tools before each model round, so an installed tool +can be used later in the same user turn. 
+ +If a tool throws during execution or installation, the registry records a +`tool_result` error instead of crashing the turn. If a previous process crash +left a tool request without a result, prompt materialization synthesizes an error +result so the conversation can continue. ### Library Tools @@ -28,6 +38,15 @@ maintainer. They are loaded explicitly by the harness. A library tool is a TypeScript module with a default export satisfying `Tool`: +- `definition.name` is the model-facing tool name. +- `definition.parameters` is a strict JSON object schema with + `additionalProperties: false`. +- `initializationParameters` validates manifest-time configuration. +- `initialize(...)` returns a handler with `execute(args, execution)`. + +Tools should not use `inputSchema`, `call`, or `invoke`; those are not part of +the `exo` tool contract. + ### Agent Tools Agent tools are created by the agent itself. They use the same default-export @@ -35,7 +54,9 @@ Agent tools are created by the agent itself. They use the same default-export `"agent"` instead of `"library"`. Agent tools should be treated as less trusted than built-in or library tools. -Load them from an explicit manifest and keep the scope narrow: +The basic harness loads them from `.exo/agent-tools/manifest.json` when agent +tool creation is enabled. That setting is enabled by default and can be disabled +per agent. The loader: @@ -94,6 +115,17 @@ exo --harness typescript agent create "Tool Demo" \ `--tool-manifest` may be passed more than once. Relative `modulePath` values in each manifest are resolved relative to that manifest file. +Agent tool creation is enabled by default. 
Disable or re-enable it with: + +```bash +exo agent create "Locked Down" \ + --model gpt-5.4 \ + --disable-agent-tool-creation + +exo agent update demo --disable-agent-tool-creation +exo agent update demo --enable-agent-tool-creation +``` + ## Safety Considerations Different tool sources have different trust levels: @@ -107,19 +139,28 @@ Recommended defaults: - Load tools explicitly, not by scanning directories. - Validate initialization parameters before exposing a tool. +- Validate generated tools against the `Tool` contract before adding them to the + manifest. - Require explicit networking enablement for tools that call external services. - Require confirmation for tools with external side effects. - Avoid persisting agent tools beyond the conversation or workspace unless a user reviews and promotes them. - Keep large logs in artifacts, not event payloads. +Agent tools currently run in the TypeScript harness process. They can use Node +built-ins, global APIs such as `fetch`, and dependencies already available to +the harness. Dependencies installed inside the conversation sandbox are not +automatically available to host-loaded agent tools; tools that need sandbox +state should call sandbox APIs from their handler. + ## Current Status -The generic registry, built-in shell registration, library tool loading, and +The generic registry, built-in tool registration, library tool loading, and agent tool manifest loading are implemented in the TypeScript harness API. -The basic TypeScript harness currently opts into the built-in shell tool and -loads library tool manifests stored on the agent config. +The basic TypeScript harness currently opts into `shell`, library tool manifests +stored on the agent config, and agent-created tools from +`.exo/agent-tools/manifest.json` when agent tool creation is enabled. There is an example library tool at `examples/typescript/tools/uppercase.ts`. 
It exists to test and demonstrate the registry contract, and can be enabled with diff --git a/examples/typescript/basic-harness.ts b/examples/typescript/basic-harness.ts index 2eddde7..2fc2d6f 100644 --- a/examples/typescript/basic-harness.ts +++ b/examples/typescript/basic-harness.ts @@ -3,8 +3,10 @@ import { defineHarness, materializePromptMessages, registerBuiltInTools, + registerAgentToolsFromManifestPathIfExists, registerLibraryToolsFromManifest, turnMetadata, + type Message, type TurnContext, } from "@exo/harness"; import { @@ -39,11 +41,6 @@ async function runBasicTurnLoop( ): Promise { const { conversation, turn } = context.exoharness.current; const maxToolRoundTrips = context.agentConfig.maxToolRoundTrips; - const tools = createToolRegistry(context); - registerBuiltInTools(tools, context, ["shell"]); - await registerLibraryToolsFromManifest(tools, context, { - tools: context.agentConfig.libraryTools, - }); let latestEventId: string | null = null; for (let round = 0; ; round += 1) { @@ -55,9 +52,10 @@ async function runBasicTurnLoop( return latestEventId; } + const tools = await createBasicToolRegistry(context); const messages = await materializePromptMessages( conversation, - context.agentConfig.instructions, + basicHarnessInstructions(context), ); const request: NativeResponsesRequest = { model, @@ -108,3 +106,37 @@ async function runBasicTurnLoop( } } } + +function basicHarnessInstructions(context: TurnContext): Message[] { + return context.agentConfig.enableAgentToolCreation + ? [...context.agentConfig.instructions, agentToolCreationInstruction()] + : context.agentConfig.instructions; +} + +function agentToolCreationInstruction(): Message { + return { + role: "developer", + content: + "Agent-created tools are supported. When the user asks you to create a reusable tool, call install_agent_tool with a complete TypeScript moduleSource. Do not claim the tool was created unless install_agent_tool returns ok: true. 
The moduleSource must default-export a Tool from @exo/harness using { definition, initializationParameters, initialize(...) }; definition.parameters must be a strict JSON schema object with additionalProperties: false; handlers must implement execute(args, execution), not invoke or call. Do not use zod, inputSchema, or external npm packages. After install_agent_tool succeeds, the new tool is available in the next model round of the same turn, so use it directly rather than falling back to shell.", + }; +} + +async function createBasicToolRegistry(context: TurnContext) { + const tools = createToolRegistry(context); + registerBuiltInTools(tools, context, builtInToolNames(context)); + await registerLibraryToolsFromManifest(tools, context, { + tools: context.agentConfig.libraryTools, + }); + if (context.agentConfig.enableAgentToolCreation) { + await registerAgentToolsFromManifestPathIfExists(tools, context); + } + return tools; +} + +function builtInToolNames( + context: TurnContext, +): Array<"shell" | "install_agent_tool"> { + return context.agentConfig.enableAgentToolCreation + ? 
["shell", "install_agent_tool"] + : ["shell"]; +} diff --git a/typescript/harness/built-in-tools.ts b/typescript/harness/built-in-tools.ts index 682f71e..2fee284 100644 --- a/typescript/harness/built-in-tools.ts +++ b/typescript/harness/built-in-tools.ts @@ -1,3 +1,6 @@ +import fs from "node:fs/promises"; +import path from "node:path"; + import type { ConversationConfig, JsonObject, @@ -6,8 +9,14 @@ import type { TurnContext, } from "./index"; import type { HarnessToolRegistry, ToolInstance } from "./tools"; +import { + DEFAULT_AGENT_TOOL_MANIFEST_PATH, + loadAgentTool, + type AgentToolManifest, + type AgentToolManifestEntry, +} from "./tool-manifest"; -export type BuiltInToolName = "shell"; +export type BuiltInToolName = "shell" | "install_agent_tool"; export function registerBuiltInTools( registry: HarnessToolRegistry, @@ -20,6 +29,8 @@ export function registerBuiltInTools( if (shell) { registry.register(shell); } + } else if (name === "install_agent_tool") { + registry.register(createInstallAgentToolInstance()); } } } @@ -80,3 +91,200 @@ export function shellToolRequest(args: JsonObject): { } export type ShellToolResult = ToolResult; + +function createInstallAgentToolInstance(): ToolInstance { + return { + source: "built_in", + definition: { + name: "install_agent_tool", + description: + "Install or replace an agent-created TypeScript tool so it can be used in the next model round. The moduleSource must default-export a Tool from @exo/harness. Do not import external npm packages; use Node built-ins, global APIs like fetch, and type-only imports from @exo/harness.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + name: { + type: "string", + description: + "Filesystem-safe tool module name, for example curl-tool or grab_webpage.", + }, + moduleSource: { + type: "string", + description: + "Complete TypeScript source for a module that default-exports a Tool. Use the shape { definition, initializationParameters, initialize(...) }. 
Do not use zod, inputSchema, outputSchema validators, or call(...) handlers.", + }, + initialization: { + type: "object", + additionalProperties: false, + properties: {}, + description: + "Initialization arguments for the tool's initializationParameters schema.", + }, + }, + required: ["name", "moduleSource", "initialization"], + }, + outputSchema: { + type: "object", + additionalProperties: false, + properties: { + ok: { type: "boolean" }, + toolName: { type: "string" }, + modulePath: { type: "string" }, + manifestPath: { type: "string" }, + availableNextRound: { type: "boolean" }, + }, + required: [ + "ok", + "toolName", + "modulePath", + "manifestPath", + "availableNextRound", + ], + }, + }, + handler: { + execute(args, execution) { + return installAgentTool(execution.context, args); + }, + }, + }; +} + +async function installAgentTool( + context: TurnContext, + args: JsonObject, +): Promise { + const name = stringArgument(args, "name"); + if (!/^[A-Za-z0-9_-]+$/.test(name)) { + throw new Error( + "agent tool name must contain only letters, numbers, underscores, and dashes", + ); + } + const moduleSource = stringArgument(args, "moduleSource"); + const initialization = objectArgument(args, "initialization"); + const toolsDirectory = path.dirname(DEFAULT_AGENT_TOOL_MANIFEST_PATH); + const manifestPath = DEFAULT_AGENT_TOOL_MANIFEST_PATH; + const modulePath = path.join(toolsDirectory, `${name}.ts`); + + await fs.mkdir(toolsDirectory, { recursive: true }); + await fs.writeFile(modulePath, moduleSource, "utf8"); + + const tool = await loadAgentTool(context, { + modulePath: path.resolve(modulePath), + initialization, + }); + if ( + tool.definition.name === "shell" || + tool.definition.name === "install_agent_tool" + ) { + throw new Error( + `agent tool cannot replace built-in tool: ${tool.definition.name}`, + ); + } + + const manifest = await readWritableAgentToolManifest(manifestPath); + const manifestEntry: AgentToolManifestEntry = { + modulePath: `./${name}.ts`, + 
initialization, + }; + const existingIndex = manifest.tools.findIndex( + (entry) => entry.modulePath === manifestEntry.modulePath, + ); + if (existingIndex >= 0) { + manifest.tools[existingIndex] = manifestEntry; + } else { + manifest.tools.push(manifestEntry); + } + await fs.writeFile( + manifestPath, + `${JSON.stringify(manifest, null, 2)}\n`, + "utf8", + ); + + return { + ok: true, + toolName: tool.definition.name, + modulePath, + manifestPath, + availableNextRound: true, + }; +} + +async function readWritableAgentToolManifest( + manifestPath: string, +): Promise { + try { + const value = JSON.parse( + await fs.readFile(manifestPath, "utf8"), + ) as unknown; + if (!isRecord(value) || !Array.isArray(value.tools)) { + throw new Error( + `agent tool manifest must contain a tools array: ${manifestPath}`, + ); + } + return { + tools: value.tools.map((entry, index) => + parseWritableManifestEntry(entry, index), + ), + }; + } catch (error) { + if (isNotFoundError(error)) { + return { tools: [] }; + } + throw error; + } +} + +function parseWritableManifestEntry( + value: unknown, + index: number, +): AgentToolManifestEntry { + if (!isRecord(value)) { + throw new Error(`agent tool manifest entry ${index} must be an object`); + } + if (typeof value.modulePath !== "string" || value.modulePath.length === 0) { + throw new Error( + `agent tool manifest entry ${index} must have a modulePath`, + ); + } + if (!isRecord(value.initialization)) { + throw new Error( + `agent tool manifest entry ${index} must have an object initialization value`, + ); + } + return { + modulePath: value.modulePath, + initialization: value.initialization, + }; +} + +function stringArgument(args: JsonObject, name: string): string { + const value = args[name]; + if (typeof value !== "string" || value.length === 0) { + throw new Error( + `install_agent_tool argument ${name} must be a non-empty string`, + ); + } + return value; +} + +function objectArgument(args: JsonObject, name: string): JsonObject { + 
const value = args[name]; + if (!isRecord(value)) { + throw new Error(`install_agent_tool argument ${name} must be an object`); + } + return value; +} + +function isRecord(value: unknown): value is JsonObject { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function isNotFoundError(error: unknown): boolean { + return ( + error !== null && + typeof error === "object" && + "code" in error && + (error as { code?: unknown }).code === "ENOENT" + ); +} diff --git a/typescript/harness/index.test.ts b/typescript/harness/index.test.ts index 841514c..0ad0f72 100644 --- a/typescript/harness/index.test.ts +++ b/typescript/harness/index.test.ts @@ -1,4 +1,7 @@ import { describe, expect, it } from "vitest"; +import fs from "node:fs/promises"; +import os from "node:os"; +import path from "node:path"; import { buildShellToolDefinitions, @@ -6,14 +9,19 @@ import { createToolRegistry, initializeTool, registerBuiltInTools, + registerAgentToolsFromManifestPathIfExists, registerAgentToolsFromManifest, registerLibraryToolsFromManifest, registerToolsFromManifest, + materializeEventsToMessages, + toolResultMessage, toolResultEvent, + type Event, type EventData, type JsonObject, type ToolExecutionContext, type ToolInstance, + type Tool, type ToolResult, type TurnContext, } from "./index"; @@ -119,7 +127,107 @@ describe("HarnessToolRegistry", () => { }, }, ]), - ).rejects.toThrow("tool execution is not configured for missing"); + ).resolves.toEqual([ + toolResultEvent("call_1", { + ok: false, + error: "tool execution is not configured for missing", + }), + ]); + }); + + it("returns tool result errors instead of throwing tool failures", async () => { + const context = fakeTurnContext(); + const registry = createToolRegistry(context).register( + fakeTool("fail", async () => { + throw new Error("boom"); + }), + ); + + await expect( + registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "fail", + arguments: {}, + }, + }, + ]), 
+ ).resolves.toEqual([ + toolResultEvent("call_1", { + ok: false, + error: "boom", + }), + ]); + }); +}); + +describe("materializeEventsToMessages", () => { + it("synthesizes results for dangling tool calls before later messages", () => { + const events: Event[] = [ + { + id: "1", + conversationId: "conversation", + createdAt: "2026-01-01T00:00:00Z", + data: { + type: "messages", + messages: [ + { + role: "assistant", + content: [ + { + type: "tool_call", + tool_call_id: "call_1", + tool_name: "install_agent_tool", + arguments: {}, + }, + ], + }, + ], + }, + }, + { + id: "2", + conversationId: "conversation", + createdAt: "2026-01-01T00:00:01Z", + data: { + type: "tool_requested", + tool_call_id: "call_1", + request: { + function_name: "install_agent_tool", + arguments: {}, + }, + }, + }, + { + id: "3", + conversationId: "conversation", + createdAt: "2026-01-01T00:00:02Z", + data: { + type: "messages", + messages: [{ role: "user", content: "try again" }], + }, + }, + ]; + + expect(materializeEventsToMessages(events)).toEqual([ + { + role: "assistant", + content: [ + { + type: "tool_call", + tool_call_id: "call_1", + tool_name: "install_agent_tool", + arguments: {}, + }, + ], + }, + toolResultMessage("call_1", "install_agent_tool", { + ok: false, + error: "tool execution did not complete before the previous turn ended", + }), + { role: "user", content: "try again" }, + ]); }); }); @@ -395,6 +503,66 @@ describe("agent tool loading", () => { expect(registry.get("uppercase")?.source).toBe("library"); }); + it("installs an agent tool and loads it from the default manifest path", async () => { + const previousCwd = process.cwd(); + const tempdir = await fs.mkdtemp(path.join(os.tmpdir(), "exo-agent-tool-")); + process.chdir(tempdir); + try { + const context = fakeTurnContext(); + const installerRegistry = createToolRegistry(context); + registerBuiltInTools(installerRegistry, context, ["install_agent_tool"]); + + await expect( + installerRegistry.executePending([ + { + 
toolCallId: "install_1", + request: { + functionName: "install_agent_tool", + arguments: { + name: "reverse-text", + moduleSource: reverseTextToolSource(), + initialization: {}, + }, + }, + }, + ]), + ).resolves.toEqual([ + toolResultEvent("install_1", { + ok: true, + toolName: "reverse_text", + modulePath: ".exo/agent-tools/reverse-text.ts", + manifestPath: ".exo/agent-tools/manifest.json", + availableNextRound: true, + }), + ]); + + const registry = createToolRegistry(context); + await registerAgentToolsFromManifestPathIfExists(registry, context); + + expect(registry.get("reverse_text")?.source).toBe("agent"); + await expect( + registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "reverse_text", + arguments: { + text: "hello", + }, + }, + }, + ]), + ).resolves.toEqual([ + toolResultEvent("call_1", { + text: "olleh", + }), + ]); + } finally { + process.chdir(previousCwd); + await fs.rm(tempdir, { recursive: true, force: true }); + } + }); + it("rejects agent tool modules without a default Tool export", async () => { const registry = createToolRegistry(fakeTurnContext()); @@ -424,6 +592,39 @@ describe("agent tool loading", () => { }), ).rejects.toThrow("tool initialization.prefix is required"); }); + + it("rejects generated tools using legacy inputSchema and invoke shapes", async () => { + const generatedTool = { + definition: { + name: "curl-tool", + description: "Fetch a URL.", + inputSchema: { + type: "object", + additionalProperties: false, + properties: { + url: { type: "string" }, + }, + required: ["url"], + }, + }, + initializationParameters: { + type: "object", + additionalProperties: false, + properties: {}, + }, + initialize() { + return { + async *invoke() { + yield { ok: true }; + }, + }; + }, + } as unknown as Tool; + + await expect( + initializeTool(generatedTool, "agent", {}, fakeTurnContext()), + ).rejects.toThrow("tool definition must use parameters, not inputSchema"); + }); }); function fakeTool( @@ -456,6 +657,45 @@ 
function uppercaseToolModulePath(): string { ).href; } +function reverseTextToolSource(): string { + return ` +import type { JsonObject, Tool, ToolResult } from "@exo/harness"; + +const reverseTextTool = { + definition: { + name: "reverse_text", + description: "Reverse text.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + text: { type: "string" }, + }, + required: ["text"], + }, + }, + initializationParameters: { + type: "object", + additionalProperties: false, + properties: {}, + }, + initialize() { + return { + async execute(args: JsonObject): Promise { + const value = args.text; + if (typeof value !== "string") { + throw new Error("text must be a string"); + } + return { text: value.split("").reverse().join("") }; + }, + }; + }, +} satisfies Tool; + +export default reverseTextTool; +`; +} + function fakeTurnContext( options: { streaming?: boolean; @@ -471,6 +711,7 @@ function fakeTurnContext( harness: "typescript", typescript: null, libraryTools: [], + enableAgentToolCreation: true, sandboxImage: null, enableNetworking: false, model: "test-model", diff --git a/typescript/harness/index.ts b/typescript/harness/index.ts index b0f85be..22340c5 100644 --- a/typescript/harness/index.ts +++ b/typescript/harness/index.ts @@ -31,6 +31,7 @@ export interface AgentConfig { modulePath: string; initialization: JsonObject; }>; + enableAgentToolCreation: boolean; sandboxImage?: string | null; enableNetworking: boolean; model: string; @@ -559,10 +560,17 @@ export async function materializeConversationMessages( export function materializeEventsToMessages(events: Event[]): Message[] { const messages: Message[] = []; const toolCallNames = new Map(); + const pendingToolCallIds: string[] = []; for (const event of events) { - extendMaterializedMessages(messages, toolCallNames, event); + extendMaterializedMessages( + messages, + toolCallNames, + pendingToolCallIds, + event, + ); } + flushDanglingToolResults(messages, toolCallNames, 
pendingToolCallIds); return messages; } @@ -700,9 +708,11 @@ function contentText(content: unknown): string { function extendMaterializedMessages( messages: Message[], toolCallNames: Map, + pendingToolCallIds: string[], event: Event, ): void { if (isMessagesEvent(event.data)) { + flushDanglingToolResults(messages, toolCallNames, pendingToolCallIds); messages.push(...event.data.messages); return; } @@ -712,6 +722,7 @@ function extendMaterializedMessages( event.data.tool_call_id, event.data.request.function_name, ); + pendingToolCallIds.push(event.data.tool_call_id); return; } @@ -720,12 +731,46 @@ function extendMaterializedMessages( if (!toolName) { return; } + removePendingToolCall(pendingToolCallIds, event.data.tool_call_id); messages.push( toolResultMessage(event.data.tool_call_id, toolName, event.data.result), ); } } +function flushDanglingToolResults( + messages: Message[], + toolCallNames: Map, + pendingToolCallIds: string[], +): void { + while (pendingToolCallIds.length > 0) { + const toolCallId = pendingToolCallIds.shift(); + if (!toolCallId) { + continue; + } + const toolName = toolCallNames.get(toolCallId); + if (!toolName) { + continue; + } + messages.push( + toolResultMessage(toolCallId, toolName, { + ok: false, + error: "tool execution did not complete before the previous turn ended", + }), + ); + } +} + +function removePendingToolCall( + pendingToolCallIds: string[], + toolCallId: string, +): void { + const index = pendingToolCallIds.indexOf(toolCallId); + if (index >= 0) { + pendingToolCallIds.splice(index, 1); + } +} + function isMessagesEvent( data: EventData, ): data is EventData & { type: "messages"; messages: Message[] } { diff --git a/typescript/harness/runner.ts b/typescript/harness/runner.ts index 4ab626b..ff8b14c 100644 --- a/typescript/harness/runner.ts +++ b/typescript/harness/runner.ts @@ -48,6 +48,7 @@ interface RawAgentConfig { module_path: string; } | null; library_tools?: RawToolManifestEntry[]; + enable_agent_tool_creation?: boolean; 
sandbox_image?: string | null; enable_networking: boolean; model: string; @@ -784,6 +785,7 @@ function toAgentConfig(raw: RawAgentConfig): AgentConfig { } : null, libraryTools: (raw.library_tools ?? []).map(toToolManifestEntry), + enableAgentToolCreation: raw.enable_agent_tool_creation ?? true, sandboxImage: raw.sandbox_image ?? null, enableNetworking: raw.enable_networking, model: raw.model, diff --git a/typescript/harness/tool-manifest.ts b/typescript/harness/tool-manifest.ts index 1f7405e..5b5ceff 100644 --- a/typescript/harness/tool-manifest.ts +++ b/typescript/harness/tool-manifest.ts @@ -1,3 +1,7 @@ +import fs from "node:fs/promises"; +import path from "node:path"; +import { pathToFileURL } from "node:url"; + import type { JsonObject, TurnContext } from "./index"; import { initializeTool, @@ -7,6 +11,10 @@ import { type ToolInstance, } from "./tools"; +export const DEFAULT_AGENT_TOOL_MANIFEST_PATH = + ".exo/agent-tools/manifest.json"; +let agentToolImportVersion = 0; + export interface ToolManifest { tools: ToolManifestEntry[]; } @@ -48,6 +56,47 @@ export function registerAgentToolsFromManifest( return registerToolsFromManifest(registry, context, manifest, "agent"); } +export async function registerAgentToolsFromManifestPathIfExists( + registry: HarnessToolRegistry, + context: TurnContext, + manifestPath = DEFAULT_AGENT_TOOL_MANIFEST_PATH, +): Promise { + const manifest = await readToolManifestIfExists(manifestPath); + if (manifest) { + await registerAgentToolsFromManifest(registry, context, manifest); + } +} + +export async function readToolManifestIfExists( + manifestPath: string, +): Promise { + try { + return await readToolManifest(manifestPath); + } catch (error) { + if (isNotFoundError(error)) { + return null; + } + throw error; + } +} + +export async function readToolManifest( + manifestPath: string, +): Promise { + const raw = JSON.parse(await fs.readFile(manifestPath, "utf8")) as unknown; + if (!isRecord(raw) || !Array.isArray(raw.tools)) { + throw 
new Error( + `tool manifest must contain a tools array: ${manifestPath}`, + ); + } + + return { + tools: raw.tools.map((entry, index) => + parseManifestEntry(entry, manifestPath, index), + ), + }; +} + export async function loadToolFromManifestEntry( context: TurnContext, entry: ToolManifestEntry, @@ -75,7 +124,9 @@ async function importTool( modulePath: string, source: Extract, ): Promise { - const module = (await import(modulePath)) as { default?: unknown }; + const module = (await import(importSpecifier(modulePath, source))) as { + default?: unknown; + }; if (!isTool(module.default)) { throw new Error( `${source} tool module must default export a Tool: ${modulePath}`, @@ -84,6 +135,66 @@ async function importTool( return module.default; } +function importSpecifier( + modulePath: string, + source: Extract, +): string { + if (source !== "agent") { + return modulePath; + } + if (modulePath.startsWith("data:")) { + return modulePath; + } + const url = modulePath.startsWith("file:") + ? new URL(modulePath) + : path.isAbsolute(modulePath) + ? 
pathToFileURL(modulePath) + : null; + if (!url) { + return modulePath; + } + agentToolImportVersion += 1; + url.searchParams.set("agentToolVersion", String(agentToolImportVersion)); + return url.href; +} + +function parseManifestEntry( + value: unknown, + manifestPath: string, + index: number, +): ToolManifestEntry { + if (!isRecord(value)) { + throw new Error(`tool manifest entry ${index} must be an object`); + } + if (typeof value.modulePath !== "string" || value.modulePath.length === 0) { + throw new Error(`tool manifest entry ${index} must have a modulePath`); + } + if (!isRecord(value.initialization)) { + throw new Error( + `tool manifest entry ${index} must have an object initialization value`, + ); + } + + return { + modulePath: resolveManifestModulePath(manifestPath, value.modulePath), + initialization: value.initialization, + }; +} + +function resolveManifestModulePath( + manifestPath: string, + modulePath: string, +): string { + if ( + modulePath.startsWith("data:") || + modulePath.startsWith("file:") || + path.isAbsolute(modulePath) + ) { + return modulePath; + } + return path.resolve(path.dirname(manifestPath), modulePath); +} + function isTool(value: unknown): value is Tool { if (!value || typeof value !== "object") { return false; @@ -99,3 +210,16 @@ function isTool(value: unknown): value is Tool { typeof candidate.initialize === "function" ); } + +function isRecord(value: unknown): value is JsonObject { + return Boolean(value) && typeof value === "object" && !Array.isArray(value); +} + +function isNotFoundError(error: unknown): boolean { + return ( + error !== null && + typeof error === "object" && + "code" in error && + (error as { code?: unknown }).code === "ENOENT" + ); +} diff --git a/typescript/harness/tools.ts b/typescript/harness/tools.ts index 56ac798..2b7d5c6 100644 --- a/typescript/harness/tools.ts +++ b/typescript/harness/tools.ts @@ -67,12 +67,32 @@ export class HarnessToolRegistry { async executePending(toolCalls: PendingToolCall[]): 
Promise { const events: EventData[] = []; for (const toolCall of toolCalls) { - const result = await this.executeToolCall(toolCall); + const result = await this.executeToolCallOrError(toolCall); events.push(toolResultEvent(toolCall.toolCallId, result)); } return events; } + private async executeToolCallOrError( + toolCall: PendingToolCall, + ): Promise { + try { + return await this.executeToolCall(toolCall); + } catch (error) { + const result = { + ok: false, + error: errorMessage(error), + }; + if (this.context.streaming) { + await this.context.stream.toolResult({ + toolCallId: toolCall.toolCallId, + result, + }); + } + return result; + } + } + private async executeToolCall( toolCall: PendingToolCall, ): Promise { @@ -103,6 +123,10 @@ export class HarnessToolRegistry { } } +function errorMessage(error: unknown): string { + return error instanceof Error ? error.message : String(error); +} + export function createToolRegistry(context: TurnContext): HarnessToolRegistry { return new HarnessToolRegistry(context); } @@ -113,21 +137,72 @@ export async function initializeTool( initializationArgs: JsonObject, context: TurnContext, ): Promise { + validateToolDefinition(tool.definition); validateJsonSchema( tool.initializationParameters, initializationArgs, "tool initialization", ); + const handler = await tool.initialize(initializationArgs, { + context, + source, + }); + validateToolHandler(handler); return { definition: tool.definition, source, - handler: await tool.initialize(initializationArgs, { - context, - source, - }), + handler, }; } +function validateToolDefinition(definition: ToolDefinition): void { + if (typeof definition.name !== "string" || definition.name.length === 0) { + throw new Error("tool definition.name must be a non-empty string"); + } + if ( + !/^[A-Za-z0-9_-]+$/.test(definition.name) || + definition.name.length > 64 + ) { + throw new Error( + "tool definition.name must contain only letters, numbers, underscores, and dashes, and be at most 64 
characters", + ); + } + if ( + typeof definition.description !== "string" || + definition.description.length === 0 + ) { + throw new Error("tool definition.description must be a non-empty string"); + } + const rawDefinition = definition as unknown as { inputSchema?: unknown }; + if (definition.parameters === undefined && rawDefinition.inputSchema) { + throw new Error("tool definition must use parameters, not inputSchema"); + } + if (!isRecord(definition.parameters)) { + throw new Error("tool definition.parameters must be an object JSON schema"); + } + if (definition.parameters.type !== "object") { + throw new Error("tool definition.parameters.type must be object"); + } + if (definition.parameters.additionalProperties !== false) { + throw new Error( + "tool definition.parameters.additionalProperties must be false", + ); + } +} + +function validateToolHandler(handler: ToolHandler): void { + if (!handler || typeof handler !== "object") { + throw new Error("tool initialize must return a handler object"); + } + const candidate = handler as { execute?: unknown; invoke?: unknown }; + if (typeof candidate.execute !== "function" && candidate.invoke) { + throw new Error("tool handler must implement execute, not invoke"); + } + if (typeof candidate.execute !== "function") { + throw new Error("tool handler must implement execute"); + } +} + function validateJsonSchema( schema: JsonValue, value: JsonValue, From a5a5829e0a1c1a21bdc883fb8d949cb6b15a5c9a Mon Sep 17 00:00:00 2001 From: 61cygni <> Date: Sat, 23 May 2026 10:27:15 -0700 Subject: [PATCH 6/8] Store tool results in artifacts Compact large tool outputs into artifact references so conversations stay within model context, and refresh turn heads after artifact writes to keep event appends consistent. 
Co-authored-by: Cursor --- crates/exoharness/src/basic.rs | 4 +- crates/exoharness/src/basic_tests.rs | 42 +++++++ docs/tools.md | 14 +++ examples/typescript/basic-harness.ts | 21 +++- typescript/harness/index.test.ts | 171 ++++++++++++++++++++++++--- typescript/harness/tools.ts | 161 +++++++++++++++++++++++-- 6 files changed, 379 insertions(+), 34 deletions(-) diff --git a/crates/exoharness/src/basic.rs b/crates/exoharness/src/basic.rs index 6dc7125..c0b5b8c 100644 --- a/crates/exoharness/src/basic.rs +++ b/crates/exoharness/src/basic.rs @@ -1402,7 +1402,6 @@ impl TurnHandle for BasicTurnHandle { } async fn add_events(&self, data: Vec) -> Result { - let expected_head = *self.latest_event_id.lock().await; let _guard = self.harness.inner.write_lock.lock().await; let mut record = self .harness @@ -1410,6 +1409,7 @@ impl TurnHandle for BasicTurnHandle { .storage .get_json::(self.conversation_dir.join("record.json")) .await?; + let expected_head = record.latest_event_id; let add_result = append_events_to_conversation( &self.harness.inner, &self.conversation_dir, @@ -1440,7 +1440,6 @@ impl TurnHandle for BasicTurnHandle { .ok_or_else(|| anyhow!("turn has no latest event id"))?; return Ok(latest); } - let expected_head = *self.latest_event_id.lock().await; let _guard = self.harness.inner.write_lock.lock().await; let mut record = self .harness @@ -1448,6 +1447,7 @@ impl TurnHandle for BasicTurnHandle { .storage .get_json::(self.conversation_dir.join("record.json")) .await?; + let expected_head = record.latest_event_id; let add_result = append_events_to_conversation( &self.harness.inner, &self.conversation_dir, diff --git a/crates/exoharness/src/basic_tests.rs b/crates/exoharness/src/basic_tests.rs index cf2e1fc..6a614c3 100644 --- a/crates/exoharness/src/basic_tests.rs +++ b/crates/exoharness/src/basic_tests.rs @@ -8,6 +8,7 @@ use crate::{ Artifact, ArtifactVersion, BasicExoHarness, BeginTurnRequest, Binding, CreateSandboxRequest, EventData, EventQuery, 
EventQueryDirection, ExoHarness, ForkConversationRequest, NewAgentRequest, NewConversationRequest, PutSecretRequest, RunInSandboxRequest, Secret, + WriteArtifactRequest, }; #[tokio::test(flavor = "current_thread")] @@ -110,6 +111,47 @@ async fn begin_turn_tracks_events_through_finish() { assert_eq!(events.last().expect("turn ended").id, latest_event_id); } +#[tokio::test(flavor = "current_thread")] +async fn turn_events_continue_after_artifact_writes() { + let tempdir = TempDir::new().expect("tempdir"); + let harness = BasicExoHarness::new_with_local_process_sandbox(tempdir.path()) + .await + .expect("harness should initialize"); + let agent = harness + .new_agent(NewAgentRequest { + slug: "agent".to_string(), + name: "Agent".to_string(), + }) + .await + .expect("agent"); + let conversation = agent + .new_conversation(NewConversationRequest::default()) + .await + .expect("conversation"); + + let turn = conversation + .begin_turn(BeginTurnRequest { + session_id: None, + input: vec![user_message("ping")], + }) + .await + .expect("turn"); + conversation + .write_artifact(WriteArtifactRequest { + path: "tool-results/example.json".to_string(), + contents: br#"{"ok":true}"#.to_vec(), + }) + .await + .expect("write artifact"); + turn.add_events(vec![EventData::Messages { + messages: vec![assistant_message("pong")], + response_id: None, + }]) + .await + .expect("append after artifact write"); + turn.finish().await.expect("finish after artifact write"); +} + #[tokio::test(flavor = "current_thread")] async fn artifacts_are_versioned_by_path() { let tempdir = TempDir::new().expect("tempdir"); diff --git a/docs/tools.md b/docs/tools.md index 25fcc62..eca3fa3 100644 --- a/docs/tools.md +++ b/docs/tools.md @@ -86,6 +86,18 @@ This means the durable conversation history records: Tracing can separately record richer information, such as duration, errors, or tool source, without changing the canonical event shape. +Tool results are artifact-backed. 
The registry writes the full tool result to a +conversation artifact and returns a compact model-facing result containing: + +- artifact metadata for the full result +- a small preview +- the inline value only when the serialized result is small enough + +For shell-like results, non-empty `stdout` and `stderr` are also written as +separate text artifacts. This keeps large HTML pages, logs, browser output, and +data dumps out of the model context while preserving the complete data for later +inspection or targeted reads. + ## Configuration And Secrets Tools should use existing exoharness configuration primitives: @@ -141,6 +153,8 @@ Recommended defaults: - Validate initialization parameters before exposing a tool. - Validate generated tools against the `Tool` contract before adding them to the manifest. +- Keep `tool_result` payloads compact; full tool outputs should flow through + artifacts. - Require explicit networking enablement for tools that call external services. - Require confirmation for tools with external side effects. 
- Avoid persisting agent tools beyond the conversation or workspace unless a diff --git a/examples/typescript/basic-harness.ts b/examples/typescript/basic-harness.ts index 2fc2d6f..c37a75a 100644 --- a/examples/typescript/basic-harness.ts +++ b/examples/typescript/basic-harness.ts @@ -6,6 +6,7 @@ import { registerAgentToolsFromManifestPathIfExists, registerLibraryToolsFromManifest, turnMetadata, + type EventData, type Message, type TurnContext, } from "@exo/harness"; @@ -39,7 +40,7 @@ async function runBasicTurnLoop( turnParent: TraceParent, model: string, ): Promise { - const { conversation, turn } = context.exoharness.current; + const { conversation } = context.exoharness.current; const maxToolRoundTrips = context.agentConfig.maxToolRoundTrips; let latestEventId: string | null = null; @@ -84,7 +85,7 @@ async function runBasicTurnLoop( const events = responseToLinguaEvents(response); if (events.length > 0) { - latestEventId = (await turn.addEvents(events)).latestEventId; + latestEventId = await appendTurnEvents(context, events); } const toolCalls = responseToolCalls(response); @@ -101,12 +102,26 @@ async function runBasicTurnLoop( (toolCall) => tools.executePending([toolCall]), ); if (toolResultEvents.length > 0) { - latestEventId = (await turn.addEvents(toolResultEvents)).latestEventId; + latestEventId = await appendTurnEvents(context, toolResultEvents); } } } } +async function appendTurnEvents( + context: TurnContext, + data: EventData[], +): Promise { + const { conversation, turn } = context.exoharness.current; + return ( + await conversation.addEvents({ + sessionId: turn.record.sessionId, + turnId: turn.record.id, + data, + }) + ).latestEventId; +} + function basicHarnessInstructions(context: TurnContext): Message[] { return context.agentConfig.enableAgentToolCreation ? 
[...context.agentConfig.instructions, agentToolCreationInstruction()] diff --git a/typescript/harness/index.test.ts b/typescript/harness/index.test.ts index 0ad0f72..1fa210b 100644 --- a/typescript/harness/index.test.ts +++ b/typescript/harness/index.test.ts @@ -70,7 +70,7 @@ describe("HarnessToolRegistry", () => { ]); expect(events).toEqual([ - toolResultEvent("call_1", { + wrappedToolResultEvent("call_1", "echo", "library", 1, { echoed: "hello", }), ]); @@ -109,7 +109,9 @@ describe("HarnessToolRegistry", () => { { type: "tool_result_streamed", toolCallId: "call_1", - result: { echoed: "hello" }, + result: wrappedToolResult("call_1", "echo", "library", 1, { + echoed: "hello", + }), }, ]); }); @@ -128,7 +130,7 @@ describe("HarnessToolRegistry", () => { }, ]), ).resolves.toEqual([ - toolResultEvent("call_1", { + wrappedToolResultEvent("call_1", "missing", "built_in", 1, { ok: false, error: "tool execution is not configured for missing", }), @@ -154,12 +156,77 @@ describe("HarnessToolRegistry", () => { }, ]), ).resolves.toEqual([ - toolResultEvent("call_1", { + wrappedToolResultEvent("call_1", "fail", "library", 1, { ok: false, error: "boom", }), ]); }); + + it("stores large shell-style output in artifacts instead of inline value", async () => { + const context = fakeTurnContext(); + const stdout = "x".repeat(9_000); + const registry = createToolRegistry(context).register( + fakeTool("shell", async () => ({ + stdout, + stderr: "", + exit_code: 0, + })), + ); + + const events = await registry.executePending([ + { + toolCallId: "call_1", + request: { + functionName: "shell", + arguments: {}, + }, + }, + ]); + + expect(events).toEqual([ + toolResultEvent("call_1", { + ok: true, + toolName: "shell", + toolCallId: "call_1", + source: "library", + resultArtifact: { + artifactId: "artifact-1", + path: "tool-results/shell/call_1/result.json", + version: 1, + sizeBytes: 9053, + mimeType: "application/json", + }, + artifacts: [ + { + artifactId: "artifact-1", + path: 
"tool-results/shell/call_1/result.json", + version: 1, + sizeBytes: 9053, + mimeType: "application/json", + }, + { + artifactId: "artifact-2", + path: "tool-results/shell/call_1/stdout.txt", + version: 1, + sizeBytes: 9000, + mimeType: "text/plain", + }, + ], + truncated: true, + preview: `${JSON.stringify( + { + stdout, + stderr: "", + exit_code: 0, + }, + null, + 2, + ).slice(0, 4_000)}\n...[truncated]`, + value: null, + }), + ]); + }); }); describe("materializeEventsToMessages", () => { @@ -356,7 +423,7 @@ describe("library tool modules", () => { }, ]), ).resolves.toEqual([ - toolResultEvent("call_1", { + wrappedToolResultEvent("call_1", "uppercase", "library", 1, { text: "result: HELLO", }), ]); @@ -396,7 +463,7 @@ describe("library tool modules", () => { }, ]), ).resolves.toEqual([ - toolResultEvent("call_1", { + wrappedToolResultEvent("call_1", "irc_send_message", "library", 1, { ok: true, dryRun: true, registered: false, @@ -438,7 +505,7 @@ describe("agent tool loading", () => { }, ]), ).resolves.toEqual([ - toolResultEvent("call_1", { + wrappedToolResultEvent("call_1", "uppercase", "library", 1, { text: "library: HELLO", }), ]); @@ -474,7 +541,7 @@ describe("agent tool loading", () => { }, ]), ).resolves.toEqual([ - toolResultEvent("call_1", { + wrappedToolResultEvent("call_1", "uppercase", "agent", 1, { text: "agent: HELLO", }), ]); @@ -527,13 +594,19 @@ describe("agent tool loading", () => { }, ]), ).resolves.toEqual([ - toolResultEvent("install_1", { - ok: true, - toolName: "reverse_text", - modulePath: ".exo/agent-tools/reverse-text.ts", - manifestPath: ".exo/agent-tools/manifest.json", - availableNextRound: true, - }), + wrappedToolResultEvent( + "install_1", + "install_agent_tool", + "built_in", + 1, + { + ok: true, + toolName: "reverse_text", + modulePath: ".exo/agent-tools/reverse-text.ts", + manifestPath: ".exo/agent-tools/manifest.json", + availableNextRound: true, + }, + ), ]); const registry = createToolRegistry(context); @@ -553,7 +626,7 @@ 
describe("agent tool loading", () => { }, ]), ).resolves.toEqual([ - toolResultEvent("call_1", { + wrappedToolResultEvent("call_1", "reverse_text", "agent", 2, { text: "olleh", }), ]); @@ -696,6 +769,58 @@ export default reverseTextTool; `; } +function wrappedToolResultEvent( + toolCallId: string, + toolName: string, + source: "built_in" | "library" | "agent", + artifactIndex: number, + value: ToolResult, +): EventData { + return toolResultEvent( + toolCallId, + wrappedToolResult(toolCallId, toolName, source, artifactIndex, value), + ); +} + +function wrappedToolResult( + toolCallId: string, + toolName: string, + source: "built_in" | "library" | "agent", + artifactIndex: number, + value: ToolResult, +): ToolResult { + const serialized = + typeof value === "string" ? value : JSON.stringify(value, null, 2); + const artifact = { + artifactId: `artifact-${artifactIndex}`, + path: `tool-results/${toolName}/${toolCallId}/result.json`, + version: 1, + sizeBytes: `${serialized}\n`.length, + mimeType: "application/json", + }; + return { + ok: resultOk(value), + toolName, + toolCallId, + source, + resultArtifact: artifact, + artifacts: [artifact], + truncated: false, + preview: serialized, + value, + }; +} + +function resultOk(value: ToolResult): boolean { + return ( + !value || + typeof value !== "object" || + Array.isArray(value) || + typeof (value as { ok?: unknown }).ok !== "boolean" || + (value as { ok: boolean }).ok + ); +} + function fakeTurnContext( options: { streaming?: boolean; @@ -705,6 +830,7 @@ function fakeTurnContext( } = {}, ): TurnContext { const streamEvents = options.streamEvents ?? 
[]; + let artifactIndex = 0; return { agentConfig: { instructions: [], @@ -733,7 +859,18 @@ function fakeTurnContext( exoharness: { current: { agent: {}, - conversation: {}, + conversation: { + async writeArtifactText(args: { path: string; text: string }) { + artifactIndex += 1; + return { + artifactId: `artifact-${artifactIndex}`, + path: args.path, + version: 1, + createdAt: "2026-01-01T00:00:00Z", + sizeBytes: args.text.length, + }; + }, + }, turn: {}, }, }, diff --git a/typescript/harness/tools.ts b/typescript/harness/tools.ts index 2b7d5c6..386408f 100644 --- a/typescript/harness/tools.ts +++ b/typescript/harness/tools.ts @@ -1,4 +1,5 @@ import type { + ArtifactVersion, EventData, JsonObject, JsonValue, @@ -10,6 +11,9 @@ import type { export type HarnessToolSource = "built_in" | "library" | "agent"; +const TOOL_RESULT_INLINE_LIMIT_CHARS = 8_000; +const TOOL_RESULT_PREVIEW_CHARS = 4_000; + export interface ToolExecutionContext { readonly context: TurnContext; readonly toolCallId?: string; @@ -76,26 +80,27 @@ export class HarnessToolRegistry { private async executeToolCallOrError( toolCall: PendingToolCall, ): Promise { + const configuredTool = + this.tools.get(toolCall.request.functionName) ?? 
null; try { - return await this.executeToolCall(toolCall); + const { tool, result } = await this.executeToolCall(toolCall); + return await this.normalizeAndStreamToolResult(toolCall, tool, result); } catch (error) { - const result = { + const result: ToolResult = { ok: false, error: errorMessage(error), }; - if (this.context.streaming) { - await this.context.stream.toolResult({ - toolCallId: toolCall.toolCallId, - result, - }); - } - return result; + return await this.normalizeAndStreamToolResult( + toolCall, + configuredTool, + result, + ); } } private async executeToolCall( toolCall: PendingToolCall, - ): Promise { + ): Promise<{ tool: ToolInstance; result: ToolResult }> { const tool = this.tools.get(toolCall.request.functionName); if (!tool) { throw new Error( @@ -113,16 +118,148 @@ export class HarnessToolRegistry { context: this.context, toolCallId: toolCall.toolCallId, }); + return { tool, result }; + } + + private async normalizeAndStreamToolResult( + toolCall: PendingToolCall, + tool: ToolInstance | null, + result: ToolResult, + ): Promise { + const normalized = await compactToolResult(this.context, { + toolCallId: toolCall.toolCallId, + toolName: tool?.definition.name ?? toolCall.request.functionName, + source: tool?.source ?? 
"built_in", + result, + }); if (this.context.streaming) { await this.context.stream.toolResult({ toolCallId: toolCall.toolCallId, - result, + result: normalized, }); } - return result; + return normalized; } } +interface CompactToolResultArgs { + toolCallId: string; + toolName: string; + source: HarnessToolSource; + result: ToolResult; +} + +interface ToolResultArtifactReference extends JsonObject { + artifactId: string; + path: string; + version: number; + sizeBytes: number; + mimeType: string; +} + +async function compactToolResult( + context: TurnContext, + args: CompactToolResultArgs, +): Promise { + const fullResultArtifact = await writeToolResultArtifact( + context, + args, + "result.json", + `${JSON.stringify(args.result, null, 2)}\n`, + "application/json", + ); + const serialized = stringifyToolResult(args.result); + const shellArtifacts = await writeShellOutputArtifacts(context, args); + const value = + serialized.length <= TOOL_RESULT_INLINE_LIMIT_CHARS ? args.result : null; + return { + ok: resultOk(args.result), + toolName: args.toolName, + toolCallId: args.toolCallId, + source: args.source, + resultArtifact: fullResultArtifact, + artifacts: [fullResultArtifact, ...shellArtifacts], + truncated: serialized.length > TOOL_RESULT_INLINE_LIMIT_CHARS, + preview: previewText(serialized), + value, + }; +} + +async function writeShellOutputArtifacts( + context: TurnContext, + args: CompactToolResultArgs, +): Promise { + if (!isRecord(args.result)) { + return []; + } + const artifacts: ToolResultArtifactReference[] = []; + for (const key of ["stdout", "stderr"] as const) { + const value = args.result[key]; + if (typeof value !== "string" || value.length === 0) { + continue; + } + artifacts.push( + await writeToolResultArtifact( + context, + args, + `${key}.txt`, + value, + "text/plain", + ), + ); + } + return artifacts; +} + +async function writeToolResultArtifact( + context: TurnContext, + args: CompactToolResultArgs, + fileName: string, + text: string, + 
mimeType: string, +): Promise { + const artifact = + await context.exoharness.current.conversation.writeArtifactText({ + path: `tool-results/${sanitizePathSegment(args.toolName)}/${sanitizePathSegment(args.toolCallId)}/${fileName}`, + text, + }); + return artifactReference(artifact, mimeType); +} + +function artifactReference( + artifact: ArtifactVersion, + mimeType: string, +): ToolResultArtifactReference { + return { + artifactId: artifact.artifactId, + path: artifact.path, + version: artifact.version, + sizeBytes: artifact.sizeBytes, + mimeType, + }; +} + +function resultOk(result: ToolResult): boolean { + if (isRecord(result) && typeof result.ok === "boolean") { + return result.ok; + } + return true; +} + +function stringifyToolResult(result: ToolResult): string { + return typeof result === "string" ? result : JSON.stringify(result, null, 2); +} + +function previewText(text: string): string { + return text.length > TOOL_RESULT_PREVIEW_CHARS + ? `${text.slice(0, TOOL_RESULT_PREVIEW_CHARS)}\n...[truncated]` + : text; +} + +function sanitizePathSegment(value: string): string { + return value.replace(/[^A-Za-z0-9_.-]+/g, "_").slice(0, 96) || "unknown"; +} + function errorMessage(error: unknown): string { return error instanceof Error ? 
error.message : String(error); } From a28d6fb0e1c97d8312f7a399d3054a7bddd737d3 Mon Sep 17 00:00:00 2001 From: 61cygni <> Date: Sat, 23 May 2026 16:31:40 -0700 Subject: [PATCH 7/8] Add Exoclaw scheduled task harness Co-authored-by: Cursor --- README.md | 21 + crates/cli/src/main.rs | 93 +- crates/cli/src/schedule.rs | 110 ++ crates/cli/src/tui.rs | 116 +- crates/executor/src/conversation_sandbox.rs | 142 +++ crates/executor/src/conversation_wakeup.rs | 21 + crates/executor/src/executor_types.rs | 1 + crates/executor/src/harness_basic_tests.rs | 10 +- crates/executor/src/harness_tool.rs | 332 +++--- crates/executor/src/lib.rs | 13 +- crates/executor/src/scheduler_runtime.rs | 413 +++++++ crates/executor/src/scheduler_store.rs | 231 ++++ crates/executor/src/scheduler_types.rs | 291 +++++ crates/executor/src/typescript.rs | 29 +- crates/exoharness/src/basic.rs | 31 +- crates/exoharness/src/basic_tests.rs | 79 ++ crates/exoharness/src/sandbox.rs | 117 +- examples/exoclaw/README.md | 62 ++ examples/exoclaw/harness.ts | 55 + examples/typescript/basic-harness.ts | 154 +-- examples/typescript/turn-loop.ts | 184 ++++ scripts/exoclaw-repl | 416 +++++++ tools.md | 1079 ------------------- typescript/harness/index.ts | 3 +- typescript/harness/runner.ts | 2 +- typescript/harness/scheduler-tools.ts | 215 ++++ 26 files changed, 2792 insertions(+), 1428 deletions(-) create mode 100644 crates/cli/src/schedule.rs create mode 100644 crates/executor/src/conversation_sandbox.rs create mode 100644 crates/executor/src/conversation_wakeup.rs create mode 100644 crates/executor/src/scheduler_runtime.rs create mode 100644 crates/executor/src/scheduler_store.rs create mode 100644 crates/executor/src/scheduler_types.rs create mode 100644 examples/exoclaw/README.md create mode 100644 examples/exoclaw/harness.ts create mode 100644 examples/typescript/turn-loop.ts create mode 100755 scripts/exoclaw-repl delete mode 100644 tools.md create mode 100644 typescript/harness/scheduler-tools.ts diff --git 
a/README.md b/README.md index 5a1a62f..fe6ba7e 100644 --- a/README.md +++ b/README.md @@ -107,12 +107,33 @@ and recursive-language-model harness experiments. For the coding-agent setup commands, see [docs/coding-agent-harnesses.md](./docs/coding-agent-harnesses.md). +## Exoclaw Long-Running Harness + +Exoclaw is the TypeScript harness example for long-running local agents. It uses +the same TypeScript runner path as the minimal harness, but opts into scheduled +task tools and scheduler-aware sandbox policy. + +Start a local Exoclaw REPL with: + +```bash +scripts/exoclaw-repl +``` + +Scheduled tasks can be created, listed, cancelled, and deleted by the agent. The +corresponding CLI commands are under: + +```bash +./target/debug/exo --harness exoclaw schedule --help +``` + ## Repository Layout - `crates`: Rust workspace for the CLI, exoharness substrate, and executors. - `typescript`: TypeScript harness runtime, model-runtime helpers, and adapter-specific support code. - `examples/typescript`: runnable TypeScript harness examples. +- `examples/exoclaw`: long-running TypeScript harness example with scheduled + task support. - `containers`: sandbox images used by the coding-agent harness examples. - `spec`: core architecture and terminology. - `docs`: design notes for in-progress directions. 
diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 3f5d33a..cd9e73f 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -7,6 +7,7 @@ mod mount_tests; mod naming_tests; #[cfg(test)] mod repl_tests; +mod schedule; #[cfg(test)] mod secret_tests; mod tui; @@ -14,15 +15,17 @@ mod tui; use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; use clap::{Parser, Subcommand, ValueEnum}; use executor::{ AgentHarnessKind, BasicExoHarness, BasicHarness, Binding, BraintrustProject, BraintrustRuntimeConfig, BraintrustTracingConfig, ConversationModelConfig, CreateAgentRequest, - CreateConversationRequest, EventQuery, EventQueryDirection, ExoHarness, FileSystemMount, - FileSystemMountMode, ForkConversationRequest, Harness, HarnessAgent, HarnessConversation, - PutSecretRequest, RlmHarness, SANDBOX_MAIN_MOUNT_DIR, Secret, SendRequest, ToolManifestEntry, - TypeScriptHarness, TypeScriptHarnessConfig, Uuid7, load_agent_config, + CreateConversationRequest, EventQuery, EventQueryDirection, ExoHarness, ExoclawToolRuntime, + FileSystemMount, FileSystemMountMode, ForkConversationRequest, Harness, HarnessAgent, + HarnessConversation, PutSecretRequest, RlmHarness, SANDBOX_MAIN_MOUNT_DIR, Secret, + ToolManifestEntry, TypeScriptHarness, TypeScriptHarnessConfig, Uuid7, load_agent_config, + send_conversation_wakeup, }; use lingua::Message; use lingua::universal::{AssistantContent, AssistantContentPart, ToolContentPart, UserContent}; @@ -61,6 +64,7 @@ enum HarnessKind { Rlm, #[value(name = "typescript")] TypeScript, + Exoclaw, } #[derive(Debug, Clone, Copy, ValueEnum)] @@ -104,6 +108,10 @@ enum Commands { #[command(subcommand)] command: ModelCommands, }, + Schedule { + #[command(subcommand)] + command: schedule::ScheduleCommands, + }, } #[derive(Debug, Subcommand)] @@ -359,6 +367,8 @@ async fn main() -> Result<(), Box> { name: Some(agent_slug), harness: to_agent_harness_kind(harness_kind), 
typescript: None, + library_tools: Vec::new(), + enable_agent_tool_creation: true, sandbox_image: None, enable_networking: false, model, @@ -392,6 +402,9 @@ async fn main() -> Result<(), Box> { run_chat_repl(conversation).await?; } + Commands::Schedule { command } => { + schedule::handle_schedule_command(&cli.root, Arc::clone(&harness), command).await?; + } Commands::Agent { command } => match command { AgentCommands::List => { println!("AGENT\tNAME"); @@ -631,9 +644,14 @@ async fn main() -> Result<(), Box> { if !changed { return Err("no changes provided".into()); } - if config.harness == AgentHarnessKind::TypeScript && config.typescript.is_none() { + if matches!( + config.harness, + AgentHarnessKind::TypeScript | AgentHarnessKind::Exoclaw + ) && config.typescript.is_none() + { return Err( - "typescript agents require a module path; pass --module ".into(), + "TypeScript and Exoclaw agents require a module path; pass --module " + .into(), ); } agent.put_config(config).await?; @@ -1070,15 +1088,7 @@ async fn main() -> Result<(), Box> { let conversation = must_get_conversation(harness.as_ref(), &agent, &conversation).await?; let previous_messages = conversation.messages().await?; - let result = conversation - .send(SendRequest { - input: vec![Message::User { - content: UserContent::String(prompt), - }], - session_id: None, - }) - .await?; - conversation.close_session(result.session_id).await?; + send_conversation_wakeup(conversation.as_ref(), prompt).await?; let messages = conversation.messages().await?; for message in &messages[previous_messages.len()..] { print_message(message); @@ -1209,7 +1219,7 @@ fn command_agent_ref(command: &Commands) -> Option<&str> { } }, Commands::Repl { agent, .. } => Some(agent.as_deref().unwrap_or(DEFAULT_REPL_SLUG)), - Commands::Secret { .. } | Commands::Model { .. } => None, + Commands::Secret { .. } | Commands::Model { .. } | Commands::Schedule { .. 
} => None, } } @@ -1249,6 +1259,14 @@ async fn instantiate_harness( HarnessKind::TypeScript => { Arc::new(TypeScriptHarness::from_root(root, runtime_config, env_vars).await?) } + HarnessKind::Exoclaw => Arc::new( + TypeScriptHarness::::exoclaw_from_root( + root, + runtime_config, + env_vars, + ) + .await?, + ), }; Ok(harness) } @@ -1258,6 +1276,7 @@ fn to_agent_harness_kind(kind: HarnessKind) -> AgentHarnessKind { HarnessKind::Basic => AgentHarnessKind::Basic, HarnessKind::Rlm => AgentHarnessKind::Rlm, HarnessKind::TypeScript => AgentHarnessKind::TypeScript, + HarnessKind::Exoclaw => AgentHarnessKind::Exoclaw, } } @@ -1266,6 +1285,7 @@ fn from_agent_harness_kind(kind: AgentHarnessKind) -> HarnessKind { AgentHarnessKind::Basic => HarnessKind::Basic, AgentHarnessKind::Rlm => HarnessKind::Rlm, AgentHarnessKind::TypeScript => HarnessKind::TypeScript, + AgentHarnessKind::Exoclaw => HarnessKind::Exoclaw, } } @@ -1274,6 +1294,7 @@ fn format_harness_kind(kind: AgentHarnessKind) -> &'static str { AgentHarnessKind::Basic => "basic", AgentHarnessKind::Rlm => "rlm", AgentHarnessKind::TypeScript => "typescript", + AgentHarnessKind::Exoclaw => "exoclaw", } } @@ -1282,11 +1303,13 @@ fn build_typescript_harness_config( module: Option<&Path>, ) -> Result, Box> { match (harness_kind, module) { - (HarnessKind::TypeScript, Some(module)) => { + (HarnessKind::TypeScript | HarnessKind::Exoclaw, Some(module)) => { Ok(Some(resolve_typescript_module_path(module)?)) } - (HarnessKind::TypeScript, None) => Err("typescript agents require --module ".into()), - (_, Some(_)) => Err("--module is only valid with --harness typescript".into()), + (HarnessKind::TypeScript | HarnessKind::Exoclaw, None) => { + Err("TypeScript and Exoclaw agents require --module ".into()) + } + (_, Some(_)) => Err("--module is only valid with --harness typescript or exoclaw".into()), (_, None) => Ok(None), } } @@ -1324,8 +1347,11 @@ fn load_tool_manifests( if paths.is_empty() { return Ok(Vec::new()); } - if harness_kind 
!= AgentHarnessKind::TypeScript { - return Err("--tool-manifest is only valid with TypeScript agents".into()); + if !matches!( + harness_kind, + AgentHarnessKind::TypeScript | AgentHarnessKind::Exoclaw + ) { + return Err("--tool-manifest is only valid with TypeScript or Exoclaw agents".into()); } let mut tools = Vec::new(); @@ -1595,28 +1621,43 @@ async fn must_get_conversation( } pub(crate) fn print_message(message: &Message) { + let timestamp = compact_timestamp(); match message { Message::User { content } => { - println!("user: {}", render_user_content(content)); + println!("{timestamp} user: {}", render_user_content(content)); } Message::Assistant { content, .. } => { - println!("assistant: {}", render_assistant_content(content)); + println!( + "{timestamp} assistant: {}", + render_assistant_content(content) + ); } Message::Tool { content } => { for part in content { let ToolContentPart::ToolResult(result) = part; - println!("tool {}: {}", result.tool_name, result.output); + println!("{timestamp} tool {}: {}", result.tool_name, result.output); } } Message::System { content } => { - println!("system: {}", render_user_content(content)); + println!("{timestamp} system: {}", render_user_content(content)); } Message::Developer { content } => { - println!("developer: {}", render_user_content(content)); + println!("{timestamp} developer: {}", render_user_content(content)); } } } +pub(crate) fn compact_timestamp() -> String { + let seconds = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|duration| duration.as_secs() % 86_400) + .unwrap_or(0); + let hours = seconds / 3_600; + let minutes = (seconds % 3_600) / 60; + let seconds = seconds % 60; + format!("[{hours:02}:{minutes:02}:{seconds:02}]") +} + fn render_user_content(content: &UserContent) -> String { match content { UserContent::String(text) => text.clone(), diff --git a/crates/cli/src/schedule.rs b/crates/cli/src/schedule.rs new file mode 100644 index 0000000..22eba9a --- /dev/null +++ 
b/crates/cli/src/schedule.rs @@ -0,0 +1,110 @@ +use std::path::Path; +use std::sync::Arc; +use std::time::Duration; + +use clap::Subcommand; +use executor::{Harness, SchedulerRunOptions, SchedulerStore, run_due_tasks}; + +#[derive(Debug, Subcommand)] +pub enum ScheduleCommands { + List { + #[arg(long)] + include_disabled: bool, + }, + Run { + #[arg(long)] + watch: bool, + #[arg(long, default_value_t = 60)] + interval_seconds: u64, + #[arg(long, default_value_t = 10)] + limit: usize, + }, + Cancel { + task_id: String, + }, + Delete { + task_id: String, + }, +} + +pub async fn handle_schedule_command( + root: &Path, + harness: Arc, + command: ScheduleCommands, +) -> Result<(), Box> { + let store = SchedulerStore::new(root.join("scheduled-tasks")); + match command { + ScheduleCommands::List { include_disabled } => { + println!("TASK\tENABLED\tSCHEDULE\tNEXT_RUN_AT_MS\tNAME"); + for task in store + .list_tasks() + .await? + .into_iter() + .filter(|task| include_disabled || task.enabled) + { + println!( + "{}\t{}\t{}\t{}\t{}", + task.id, task.enabled, task.schedule, task.next_run_at_ms, task.name + ); + } + } + ScheduleCommands::Run { + watch, + interval_seconds, + limit, + } => loop { + let runs = + run_due_tasks(Arc::clone(&harness), &store, SchedulerRunOptions { limit }).await?; + for run in runs { + println!( + "{}\t{}\texit={}\terror={}", + run.task_id, + run.id, + run.exit_code + .map(|code| code.to_string()) + .unwrap_or_else(|| "none".to_string()), + run.error.unwrap_or_else(|| "none".to_string()) + ); + } + if !watch { + break; + } + tokio::time::sleep(Duration::from_secs(interval_seconds)).await; + }, + ScheduleCommands::Cancel { task_id } => { + if let Some(task) = store.disable_task(&task_id).await? 
{ + stop_task_owned_sandbox(harness.as_ref(), &task).await?; + println!("cancelled scheduled task {}", task_id); + } else { + return Err(format!("scheduled task not found: {task_id}").into()); + } + } + ScheduleCommands::Delete { task_id } => { + if let Some(task) = store.delete_task(&task_id).await? { + stop_task_owned_sandbox(harness.as_ref(), &task).await?; + println!("deleted scheduled task {}", task_id); + } else { + return Err(format!("scheduled task not found: {task_id}").into()); + } + } + } + Ok(()) +} + +async fn stop_task_owned_sandbox( + harness: &dyn Harness, + task: &executor::ScheduledTaskRecord, +) -> Result<(), Box> { + let Some(sandbox_id) = task.task_sandbox_id.clone() else { + return Ok(()); + }; + if let Some(agent) = harness.get_agent(&task.agent_id).await? + && let Some(conversation) = agent.get_conversation(&task.conversation_id).await? + { + conversation + .exoharness_handle() + .stop_sandbox(sandbox_id) + .await?; + } + Ok(()) +} diff --git a/crates/cli/src/tui.rs b/crates/cli/src/tui.rs index 8d950d8..15df61b 100644 --- a/crates/cli/src/tui.rs +++ b/crates/cli/src/tui.rs @@ -13,12 +13,13 @@ use lingua::universal::{UserContent, UserContentPart}; use lingua::{Message, UniversalStreamChunk}; use rustyline::error::ReadlineError; use rustyline::history::{History, MemHistory, SearchDirection, SearchResult}; -use rustyline::{Cmd, Config, Editor, KeyCode, KeyEvent, Modifiers}; +use rustyline::{Cmd, Config, Editor, ExternalPrinter, KeyCode, KeyEvent, Modifiers}; use serde_json::{Map, Value}; use tokio::runtime::Handle; +use tokio::task::JoinHandle; use tokio_stream::StreamExt; -use crate::{print_message, render_assistant_content}; +use crate::{compact_timestamp, print_message, render_assistant_content}; const REMOTE_HISTORY_BASE: usize = 1_000_000; const REMOTE_HISTORY_PAGE_SIZE: u32 = 32; @@ -342,20 +343,20 @@ struct ChatRepl { conversation: Arc, editor: Editor<(), ChatHistory>, session_id: Option, + watch_after: Arc>>, } impl ChatRepl { fn 
new(conversation: Arc) -> Result> { - let history = ChatHistory::new( - conversation.exoharness_handle(), - conversation.record().latest_event_id, - ); + let latest_event_id = conversation.record().latest_event_id; + let history = ChatHistory::new(conversation.exoharness_handle(), latest_event_id); let mut editor = Editor::with_history(Config::default(), history)?; editor.bind_sequence(KeyEvent(KeyCode::Enter, Modifiers::ALT), Cmd::Newline); Ok(Self { conversation, editor, session_id: None, + watch_after: Arc::new(Mutex::new(latest_event_id)), }) } @@ -369,7 +370,12 @@ impl ChatRepl { async fn run(&mut self) -> Result<(), Box> { loop { let prompt = format!("{}> ", self.conversation.record().slug); - match self.editor.readline(&prompt) { + let event_printer = self.spawn_event_printer()?; + let readline_result = self.editor.readline(&prompt); + event_printer.abort(); + let _ = event_printer.await; + + match readline_result { Ok(line) => { let trimmed = line.trim(); if trimmed.is_empty() { @@ -430,7 +436,7 @@ impl ChatRepl { continue; } if !printed_assistant { - print!("assistant: "); + print!("{} assistant: ", compact_timestamp()); stdout.flush()?; printed_assistant = true; } @@ -455,6 +461,8 @@ impl ChatRepl { } ExecutionStreamEvent::Completed(result) => { self.session_id = Some(result.session_id); + *self.watch_after.lock().expect("chat event watch poisoned") = + Some(result.latest_event_id); } } } @@ -465,13 +473,51 @@ impl ChatRepl { if let Message::Assistant { content, .. 
} = last_message { let rendered = render_assistant_content(&content); if !rendered.is_empty() { - println!("assistant: {}", rendered); + println!("{} assistant: {}", compact_timestamp(), rendered); } } } println!(); Ok(()) } + + fn spawn_event_printer(&mut self) -> Result, Box> { + let conversation = self.conversation.exoharness_handle(); + let watch_after = Arc::clone(&self.watch_after); + let mut printer = self.editor.create_external_printer()?; + Ok(tokio::spawn(async move { + let mut interval = tokio::time::interval(Duration::from_secs(1)); + loop { + interval.tick().await; + let cursor = *watch_after.lock().expect("chat event watch poisoned"); + match conversation + .get_events(Some(EventQuery { + cursor, + direction: Some(EventQueryDirection::Asc), + limit: Some(100), + session_id: None, + turn_id: None, + types: None, + })) + .await + { + Ok(result) => { + for event in result.events { + *watch_after.lock().expect("chat event watch poisoned") = + Some(event.id); + for rendered in render_external_event(&event.data) { + let _ = printer.print(format!("{rendered}\n")); + } + } + } + Err(error) => { + let _ = printer.print(format!("event watcher error: {error}\n")); + break; + } + } + } + })) + } } fn chunk_text(chunk: &UniversalStreamChunk) -> String { @@ -547,6 +593,35 @@ fn render_value_inline(value: &Value) -> String { } } +fn render_external_event(data: &EventData) -> Vec { + let EventData::Messages { messages, .. } = data else { + return Vec::new(); + }; + messages + .iter() + .filter_map(|message| match message { + Message::User { content } => render_external_user_content(content) + .map(|rendered| format!("{} user: {rendered}", compact_timestamp())), + Message::Assistant { content, .. 
} => { + let rendered = render_assistant_content(content); + (!rendered.is_empty()) + .then(|| format!("{} assistant: {rendered}", compact_timestamp())) + } + _ => None, + }) + .collect() +} + +fn render_external_user_content(content: &UserContent) -> Option { + let rendered = render_user_content_for_history(content); + let trimmed = rendered.trim(); + if trimmed.starts_with("Scheduled task `") { + Some(trimmed.to_string()) + } else { + None + } +} + fn render_user_content_for_history(content: &UserContent) -> String { match content { UserContent::String(text) => text.clone(), @@ -563,7 +638,12 @@ fn render_user_content_for_history(content: &UserContent) -> String { #[cfg(test)] mod tests { - use super::{render_tool_call, render_tool_result, render_user_content_for_history}; + use super::{ + render_external_event, render_tool_call, render_tool_result, + render_user_content_for_history, + }; + use executor::EventData; + use lingua::Message; use lingua::universal::UserContent; use serde_json::{Map, Value}; @@ -600,6 +680,22 @@ mod tests { ); } + #[test] + fn renders_scheduled_task_wakeup_user_messages() { + let rendered = render_external_event(&EventData::Messages { + messages: vec![Message::User { + content: UserContent::String( + "Scheduled task `joke` completed.\n\nstdout preview:\nhello".to_string(), + ), + }], + response_id: None, + }); + + assert_eq!(rendered.len(), 1); + assert!(rendered[0].contains("user: Scheduled task `joke` completed.")); + assert!(rendered[0].contains("stdout preview:\nhello")); + } + #[test] fn renders_user_string_content_for_history() { let content = UserContent::String("first second".to_string()); diff --git a/crates/executor/src/conversation_sandbox.rs b/crates/executor/src/conversation_sandbox.rs new file mode 100644 index 0000000..7347fa1 --- /dev/null +++ b/crates/executor/src/conversation_sandbox.rs @@ -0,0 +1,142 @@ +use crate::{AgentConfig, ConversationConfig}; +use exoharness::{ + ConversationHandle, CreateSandboxRequest, 
DEFAULT_SANDBOX_IMAGE, EventData, EventQuery, + EventQueryDirection, FileSystemMount, FileSystemMountMode, Result, +}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct ConversationSandboxInfo { + pub(crate) id: String, + image: String, + default_workdir: String, + file_system_mounts: Vec, + enable_networking: bool, + idle_seconds: u64, +} + +impl ConversationSandboxInfo { + fn matches_spec(&self, spec: &ConversationSandboxSpec) -> bool { + self.image == spec.image + && self.default_workdir == spec.default_workdir + && self.file_system_mounts == spec.file_system_mounts + && self.enable_networking == spec.enable_networking + && self.idle_seconds == spec.idle_seconds + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct ConversationSandboxSpec { + image: String, + default_workdir: String, + file_system_mounts: Vec, + enable_networking: bool, + idle_seconds: u64, +} + +pub(crate) async fn ensure_conversation_sandbox( + conversation: &dyn ConversationHandle, + agent_config: &AgentConfig, + config: &ConversationConfig, +) -> Result { + let spec = conversation_sandbox_spec(agent_config, config); + + for sandbox in conversation_sandboxes(conversation).await? 
{ + if sandbox.matches_spec(&spec) { + return Ok(sandbox.id); + } + } + + create_conversation_sandbox(conversation, agent_config, config).await +} + +pub(crate) async fn create_conversation_sandbox( + conversation: &dyn ConversationHandle, + agent_config: &AgentConfig, + config: &ConversationConfig, +) -> Result { + let spec = conversation_sandbox_spec(agent_config, config); + conversation + .create_sandbox(CreateSandboxRequest { + image: spec.image, + default_workdir: Some(spec.default_workdir), + file_system_mounts: Some(spec.file_system_mounts), + enable_networking: Some(spec.enable_networking), + idle_seconds: Some(spec.idle_seconds), + }) + .await +} + +pub(crate) async fn conversation_sandboxes( + conversation: &dyn ConversationHandle, +) -> Result> { + let events = conversation + .get_events(Some(EventQuery { + cursor: None, + direction: Some(EventQueryDirection::Asc), + limit: None, + session_id: None, + turn_id: None, + types: Some(vec!["sandbox_created".to_string()]), + })) + .await? 
+ .events; + + let mut sandboxes = Vec::new(); + for event in events { + if let EventData::SandboxCreated { + sandbox_id, + image, + default_workdir, + file_system_mounts, + enable_networking, + idle_seconds, + } = event.data + { + sandboxes.push(ConversationSandboxInfo { + id: sandbox_id, + image, + default_workdir, + file_system_mounts, + enable_networking, + idle_seconds, + }); + } + } + + Ok(sandboxes) +} + +fn conversation_sandbox_spec( + agent_config: &AgentConfig, + config: &ConversationConfig, +) -> ConversationSandboxSpec { + ConversationSandboxSpec { + image: agent_config + .sandbox_image + .clone() + .unwrap_or_else(|| DEFAULT_SANDBOX_IMAGE.to_string()), + default_workdir: config + .mounts + .first() + .map(|mount| mount.mount_path.clone()) + .unwrap_or_else(|| "/".to_string()), + file_system_mounts: normalize_mounts(&config.mounts), + enable_networking: agent_config.enable_networking || config.enable_networking, + idle_seconds: 300, + } +} + +fn normalize_mounts(mounts: &[FileSystemMount]) -> Vec { + mounts + .iter() + .map(|mount| FileSystemMount { + host_path: mount.host_path.clone(), + mount_path: mount.mount_path.clone(), + mode: match mount.mode { + FileSystemMountMode::ReadOnly => FileSystemMountMode::ReadOnly, + FileSystemMountMode::ReadWrite => FileSystemMountMode::ReadWrite, + }, + internal: Some(mount.internal.unwrap_or(false)), + }) + .collect() +} diff --git a/crates/executor/src/conversation_wakeup.rs b/crates/executor/src/conversation_wakeup.rs new file mode 100644 index 0000000..546dd82 --- /dev/null +++ b/crates/executor/src/conversation_wakeup.rs @@ -0,0 +1,21 @@ +use exoharness::Result; +use lingua::Message; +use lingua::universal::UserContent; + +use crate::{HarnessConversation, SendRequest, SendResult}; + +pub async fn send_conversation_wakeup( + conversation: &dyn HarnessConversation, + prompt: String, +) -> Result { + let result = conversation + .send(SendRequest { + input: vec![Message::User { + content: 
UserContent::String(prompt), + }], + session_id: None, + }) + .await?; + conversation.close_session(result.session_id).await?; + Ok(result) +} diff --git a/crates/executor/src/executor_types.rs b/crates/executor/src/executor_types.rs index c8c63aa..6c3ae96 100644 --- a/crates/executor/src/executor_types.rs +++ b/crates/executor/src/executor_types.rs @@ -44,6 +44,7 @@ pub enum AgentHarnessKind { Rlm, #[serde(rename = "typescript")] TypeScript, + Exoclaw, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] diff --git a/crates/executor/src/harness_basic_tests.rs b/crates/executor/src/harness_basic_tests.rs index 7668812..cb5f905 100644 --- a/crates/executor/src/harness_basic_tests.rs +++ b/crates/executor/src/harness_basic_tests.rs @@ -327,7 +327,11 @@ async fn send_executes_shell_tool_when_enabled() { usage: None, }, ])); - let harness = BasicHarness::new(exoharness, Arc::clone(&model), Arc::new(BasicToolRuntime)); + let harness = BasicHarness::new( + exoharness, + Arc::clone(&model), + Arc::new(BasicToolRuntime::default()), + ); register_test_models(harness.exoharness_handle().as_ref()).await; let agent = harness @@ -474,7 +478,7 @@ async fn harness_exposes_raw_exoharness_handles() { } #[tokio::test(flavor = "current_thread")] -async fn updating_mounts_recreates_shell_sandbox() { +async fn updating_mounts_recreates_conversation_sandbox() { let tempdir = TempDir::new().expect("tempdir should exist"); let mount_dir = tempdir.path().join("mount"); std::fs::create_dir_all(&mount_dir).expect("mount dir should exist"); @@ -498,7 +502,7 @@ async fn updating_mounts_recreates_shell_sandbox() { usage: None, }, ])); - let harness = BasicHarness::new(exoharness, model, Arc::new(BasicToolRuntime)); + let harness = BasicHarness::new(exoharness, model, Arc::new(BasicToolRuntime::default())); register_test_models(harness.exoharness_handle().as_ref()).await; let agent = harness diff --git a/crates/executor/src/harness_tool.rs b/crates/executor/src/harness_tool.rs index 
6883db0..56b4df9 100644 --- a/crates/executor/src/harness_tool.rs +++ b/crates/executor/src/harness_tool.rs @@ -1,10 +1,13 @@ +use std::path::PathBuf; + +use crate::conversation_sandbox::{conversation_sandboxes, ensure_conversation_sandbox}; +use crate::scheduler_store::SchedulerStore; +use crate::scheduler_types::{ + DEFAULT_MAX_OUTPUT_BYTES, NewScheduledTask, ScheduledTaskSandboxMode, +}; use crate::{AgentConfig, ConversationConfig, ToolRuntime}; use async_trait::async_trait; -use exoharness::{ - ConversationHandle, CreateSandboxRequest, DEFAULT_SANDBOX_IMAGE, EventData, EventQuery, - EventQueryDirection, FileSystemMount, FileSystemMountMode, Result, RunInSandboxRequest, - ToolRequest, ToolResult, -}; +use exoharness::{ConversationHandle, Result, RunInSandboxRequest, ToolRequest, ToolResult}; use futures::io::AsyncReadExt; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -12,6 +15,19 @@ use serde_json::Value; #[derive(Debug, Clone, Default)] pub struct BasicToolRuntime; +#[derive(Debug, Clone)] +pub struct ExoclawToolRuntime { + scheduler_store: SchedulerStore, +} + +impl ExoclawToolRuntime { + pub fn with_scheduler_root(root: impl Into) -> Self { + Self { + scheduler_store: SchedulerStore::new(root), + } + } +} + #[async_trait] impl ToolRuntime for BasicToolRuntime { async fn prepare_conversation( @@ -21,8 +37,35 @@ impl ToolRuntime for BasicToolRuntime { config: &ConversationConfig, ) -> Result<()> { if config.shell_program.is_some() { - ensure_shell_sandbox(conversation, agent_config, config).await?; + ensure_conversation_sandbox(conversation, agent_config, config).await?; + } + Ok(()) + } + + async fn execute( + &self, + conversation: &dyn ConversationHandle, + config: &ConversationConfig, + request: &ToolRequest, + ) -> Result { + match request.function_name.as_str() { + "shell" => execute_shell_tool(conversation, config, request).await, + other => Err(anyhow::anyhow!( + "tool execution is not configured for {other}" + )), } + } +} + 
+#[async_trait] +impl ToolRuntime for ExoclawToolRuntime { + async fn prepare_conversation( + &self, + conversation: &dyn ConversationHandle, + agent_config: &AgentConfig, + config: &ConversationConfig, + ) -> Result<()> { + ensure_conversation_sandbox(conversation, agent_config, config).await?; Ok(()) } @@ -34,6 +77,20 @@ impl ToolRuntime for BasicToolRuntime { ) -> Result { match request.function_name.as_str() { "shell" => execute_shell_tool(conversation, config, request).await, + "schedule_sandbox_task" => { + execute_schedule_task_tool(&self.scheduler_store, request).await + } + "list_scheduled_tasks" => { + execute_list_scheduled_tasks_tool(&self.scheduler_store, request).await + } + "cancel_scheduled_task" => { + execute_cancel_scheduled_task_tool(conversation, &self.scheduler_store, request) + .await + } + "delete_scheduled_task" => { + execute_delete_scheduled_task_tool(conversation, &self.scheduler_store, request) + .await + } other => Err(anyhow::anyhow!( "tool execution is not configured for {other}" )), @@ -53,6 +110,148 @@ struct ShellToolResult { exit_code: i32, } +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +struct ScheduleTaskArguments { + agent_id: String, + conversation_id: String, + name: String, + schedule: String, + sandbox_mode: Option, + setup_command: Option>, + command: Vec, + report_prompt: String, + max_output_bytes: Option, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +struct ConversationScopedArguments { + agent_id: String, + conversation_id: String, + include_disabled: Option, +} + +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +struct ScheduledTaskIdArguments { + agent_id: String, + conversation_id: String, + task_id: String, +} + +async fn execute_schedule_task_tool( + store: &SchedulerStore, + request: &ToolRequest, +) -> Result { + let args = + serde_json::from_value::(Value::Object(request.arguments.clone()))?; + let task = store + .create_task(NewScheduledTask { 
+ agent_id: args.agent_id, + conversation_id: args.conversation_id, + name: args.name, + schedule: args.schedule, + sandbox_mode: args.sandbox_mode, + setup_command: args.setup_command, + command: args.command, + report_prompt: args.report_prompt, + max_output_bytes: Some(args.max_output_bytes.unwrap_or(DEFAULT_MAX_OUTPUT_BYTES)), + }) + .await?; + Ok(serde_json::json!({ + "ok": true, + "taskId": task.id, + "name": task.name, + "schedule": task.schedule, + "nextRunAtMs": task.next_run_at_ms, + })) +} + +async fn execute_list_scheduled_tasks_tool( + store: &SchedulerStore, + request: &ToolRequest, +) -> Result { + let args = serde_json::from_value::(Value::Object( + request.arguments.clone(), + ))?; + let tasks = store + .list_tasks_for_conversation( + &args.agent_id, + &args.conversation_id, + args.include_disabled.unwrap_or(false), + ) + .await?; + Ok(serde_json::json!({ + "ok": true, + "tasks": tasks, + })) +} + +async fn execute_cancel_scheduled_task_tool( + conversation: &dyn ConversationHandle, + store: &SchedulerStore, + request: &ToolRequest, +) -> Result { + let args = serde_json::from_value::(Value::Object( + request.arguments.clone(), + ))?; + let Some(task) = store.get_task(&args.task_id).await? 
else { + return Ok(serde_json::json!({ + "ok": false, + "error": "scheduled task not found for this conversation", + })); + }; + if task.agent_id != args.agent_id || task.conversation_id != args.conversation_id { + return Ok(serde_json::json!({ + "ok": false, + "error": "scheduled task not found for this conversation", + })); + } + let task_sandbox_id = task.task_sandbox_id.clone(); + store.disable_task(&args.task_id).await?; + if let Some(sandbox_id) = task_sandbox_id { + conversation.stop_sandbox(sandbox_id).await?; + } + Ok(serde_json::json!({ + "ok": true, + "taskId": args.task_id, + "cancelled": true, + })) +} + +async fn execute_delete_scheduled_task_tool( + conversation: &dyn ConversationHandle, + store: &SchedulerStore, + request: &ToolRequest, +) -> Result { + let args = serde_json::from_value::(Value::Object( + request.arguments.clone(), + ))?; + let Some(task) = store.get_task(&args.task_id).await? else { + return Ok(serde_json::json!({ + "ok": false, + "error": "scheduled task not found for this conversation", + })); + }; + if task.agent_id != args.agent_id || task.conversation_id != args.conversation_id { + return Ok(serde_json::json!({ + "ok": false, + "error": "scheduled task not found for this conversation", + })); + } + if let Some(sandbox_id) = task.task_sandbox_id.clone() { + conversation.stop_sandbox(sandbox_id).await?; + } + store.delete_task(&args.task_id).await?; + Ok(serde_json::json!({ + "ok": true, + "taskId": args.task_id, + "deleted": true, + "runsDeleted": true, + })) +} + async fn execute_shell_tool( conversation: &dyn ConversationHandle, config: &ConversationConfig, @@ -64,8 +263,10 @@ async fn execute_shell_tool( .shell_program .clone() .ok_or_else(|| anyhow::anyhow!("shell tool is not enabled for this conversation"))?; - let sandbox_id = latest_shell_sandbox(conversation) + let sandbox_id = conversation_sandboxes(conversation) .await? 
+ .into_iter() + .next() .ok_or_else(|| anyhow::anyhow!("shell sandbox is not available for this conversation"))? .id; let process = conversation @@ -97,120 +298,3 @@ async fn execute_shell_tool( exit_code, })?) } - -pub(crate) async fn ensure_shell_sandbox( - conversation: &dyn ConversationHandle, - agent_config: &AgentConfig, - config: &ConversationConfig, -) -> Result { - let desired_default_workdir = config - .mounts - .first() - .map(|mount| mount.mount_path.clone()) - .unwrap_or_else(|| "/".to_string()); - let desired_mounts = normalize_mounts(&config.mounts); - let desired_image = agent_config - .sandbox_image - .clone() - .unwrap_or_else(|| DEFAULT_SANDBOX_IMAGE.to_string()); - let desired_enable_networking = agent_config.enable_networking || config.enable_networking; - - if let Some(sandbox) = latest_shell_sandbox(conversation).await? { - let Some(program) = &config.shell_program else { - return Ok(sandbox.id); - }; - - let config_matches = sandbox.image == desired_image - && sandbox.default_workdir == desired_default_workdir - && sandbox.file_system_mounts == desired_mounts - && sandbox.enable_networking == desired_enable_networking - && sandbox.idle_seconds == 300; - - if config_matches { - let healthcheck = conversation - .run_in_sandbox(RunInSandboxRequest { - id: sandbox.id.clone(), - command: vec![program.clone(), "-lc".to_string(), "true".to_string()], - env: Default::default(), - }) - .await; - if healthcheck.is_ok() { - return Ok(sandbox.id); - } - } - } - - conversation - .create_sandbox(CreateSandboxRequest { - image: desired_image, - default_workdir: Some(desired_default_workdir), - file_system_mounts: Some(desired_mounts), - enable_networking: Some(desired_enable_networking), - idle_seconds: Some(300), - }) - .await -} - -fn normalize_mounts(mounts: &[FileSystemMount]) -> Vec { - mounts - .iter() - .map(|mount| FileSystemMount { - host_path: mount.host_path.clone(), - mount_path: mount.mount_path.clone(), - mode: match mount.mode { - 
FileSystemMountMode::ReadOnly => FileSystemMountMode::ReadOnly, - FileSystemMountMode::ReadWrite => FileSystemMountMode::ReadWrite, - }, - internal: Some(mount.internal.unwrap_or(false)), - }) - .collect() -} - -#[derive(Debug, Clone, PartialEq, Eq)] -struct ShellSandboxInfo { - id: String, - image: String, - default_workdir: String, - file_system_mounts: Vec, - enable_networking: bool, - idle_seconds: u64, -} - -async fn latest_shell_sandbox( - conversation: &dyn ConversationHandle, -) -> Result> { - let events = conversation - .get_events(Some(EventQuery { - cursor: None, - direction: Some(EventQueryDirection::Desc), - limit: Some(50), - session_id: None, - turn_id: None, - types: Some(vec!["sandbox_created".to_string()]), - })) - .await? - .events; - - for event in events { - if let EventData::SandboxCreated { - sandbox_id, - image, - default_workdir, - file_system_mounts, - enable_networking, - idle_seconds, - } = event.data - { - return Ok(Some(ShellSandboxInfo { - id: sandbox_id, - image, - default_workdir, - file_system_mounts, - enable_networking, - idle_seconds, - })); - } - } - - Ok(None) -} diff --git a/crates/executor/src/lib.rs b/crates/executor/src/lib.rs index 3a9575f..886fed7 100644 --- a/crates/executor/src/lib.rs +++ b/crates/executor/src/lib.rs @@ -4,6 +4,8 @@ mod basic_tests; mod braintrust; #[cfg(test)] mod braintrust_tests; +mod conversation_sandbox; +mod conversation_wakeup; mod execution_tracing; mod executor_types; mod harness_basic; @@ -20,10 +22,14 @@ mod harness_types; mod rlm; #[cfg(test)] mod rlm_tests; +mod scheduler_runtime; +mod scheduler_store; +mod scheduler_types; mod shared; mod typescript; pub use braintrust::{BraintrustProject, BraintrustRuntimeConfig, BraintrustTracingConfig}; +pub use conversation_wakeup::send_conversation_wakeup; pub use executor_types::{ AgentConfig, AgentHarnessKind, ConversationConfig, ConversationModelConfig, ExecutionStreamEvent, ExecutionStreamHandle, ModelClient, ModelRequest, ModelResponse, @@ 
-38,11 +44,16 @@ pub use exoharness::{ }; pub use harness_basic::BasicHarness; pub use harness_config::load_agent_config; -pub use harness_tool::BasicToolRuntime; +pub use harness_tool::{BasicToolRuntime, ExoclawToolRuntime}; pub use harness_types::{ CreateAgentRequest, CreateConversationRequest, Harness, HarnessAgent, HarnessConversation, }; pub use rlm::RlmHarness; +pub use scheduler_runtime::{SchedulerRunOptions, run_due_tasks, run_task}; +pub use scheduler_store::SchedulerStore; +pub use scheduler_types::{ + DEFAULT_MAX_OUTPUT_BYTES, NewScheduledTask, ScheduledTaskRecord, ScheduledTaskRunRecord, now_ms, +}; pub use typescript::TypeScriptHarness; pub(crate) use basic::BasicExecutor; diff --git a/crates/executor/src/scheduler_runtime.rs b/crates/executor/src/scheduler_runtime.rs new file mode 100644 index 0000000..135284e --- /dev/null +++ b/crates/executor/src/scheduler_runtime.rs @@ -0,0 +1,413 @@ +use std::sync::Arc; + +use anyhow::{Result, anyhow}; +use exoharness::{RunInSandboxRequest, SandboxProcess, WriteArtifactRequest}; +use futures::io::{AsyncRead, AsyncReadExt}; +use serde::Serialize; + +use crate::conversation_sandbox::{create_conversation_sandbox, ensure_conversation_sandbox}; +use crate::conversation_wakeup::send_conversation_wakeup; +use crate::scheduler_store::SchedulerStore; +use crate::scheduler_types::{ + ScheduledTaskRecord, ScheduledTaskRunRecord, ScheduledTaskSandboxMode, now_ms, +}; +use crate::{Harness, Uuid7}; + +#[derive(Debug, Clone, Copy)] +pub struct SchedulerRunOptions { + pub limit: usize, +} + +impl Default for SchedulerRunOptions { + fn default() -> Self { + Self { limit: 10 } + } +} + +#[derive(Debug, Serialize)] +struct ScheduledTaskArtifact { + task_id: String, + task_name: String, + run_id: String, + sandbox_id: Option, + setup_command: Option>, + command: Vec, + exit_code: Option, + setup_stdout: Option, + setup_stderr: Option, + stdout: String, + stderr: String, + truncated: bool, + error: Option, +} + +pub async fn 
run_due_tasks( + harness: Arc, + store: &SchedulerStore, + options: SchedulerRunOptions, +) -> Result> { + let mut due = store.due_tasks(now_ms()).await?; + due.sort_by_key(|task| task.next_run_at_ms); + due.truncate(options.limit); + + let mut runs = Vec::new(); + for task in due { + runs.push(run_task(Arc::clone(&harness), store, task).await?); + } + Ok(runs) +} + +pub async fn run_task( + harness: Arc, + store: &SchedulerStore, + mut task: ScheduledTaskRecord, +) -> Result { + let started_at_ms = now_ms(); + let run_id = Uuid7::now().to_string(); + let run_result = run_task_inner(Arc::clone(&harness), &mut task, &run_id).await; + let finished_at_ms = now_ms(); + + let (mut run, result_artifact_id) = match run_result { + Ok(output) => { + let stdout_bytes = output.stdout.len() as u64; + let stderr_bytes = output.stderr.len() as u64; + ( + ScheduledTaskRunRecord { + id: run_id, + task_id: task.id.clone(), + started_at_ms, + finished_at_ms, + exit_code: output.exit_code, + stdout_bytes, + stderr_bytes, + truncated: output.truncated, + result_artifact_id: output.result_artifact_id.clone(), + error: output.error.clone(), + }, + output.result_artifact_id, + ) + } + Err(error) => ( + ScheduledTaskRunRecord { + id: run_id, + task_id: task.id.clone(), + started_at_ms, + finished_at_ms, + exit_code: None, + stdout_bytes: 0, + stderr_bytes: 0, + truncated: false, + result_artifact_id: None, + error: Some(error.to_string()), + }, + None, + ), + }; + + task.mark_completed(&run, result_artifact_id, finished_at_ms)?; + run.task_id = task.id.clone(); + store.put_run(&run).await?; + store.put_task(&task).await?; + Ok(run) +} + +struct TaskOutput { + exit_code: Option, + stdout: Vec, + stderr: Vec, + truncated: bool, + result_artifact_id: Option, + error: Option, +} + +async fn run_task_inner( + harness: Arc, + task: &mut ScheduledTaskRecord, + run_id: &str, +) -> Result { + let agent = harness + .get_agent(&task.agent_id) + .await? 
+ .ok_or_else(|| anyhow!("scheduled task agent does not exist: {}", task.agent_id))?; + let conversation = agent + .get_conversation(&task.conversation_id) + .await? + .ok_or_else(|| { + anyhow!( + "scheduled task conversation does not exist: {}", + task.conversation_id + ) + })?; + let agent_config = agent.config().await?; + let conversation_config = conversation.config().await?; + let conversation_handle = conversation.exoharness_handle(); + let sandbox_id = resolve_task_sandbox( + task, + conversation_handle.as_ref(), + &agent_config, + &conversation_config, + ) + .await?; + let command_result: Result = async { + let process = conversation_handle + .run_in_sandbox(RunInSandboxRequest { + id: sandbox_id.clone(), + command: task + .setup_command + .clone() + .unwrap_or_else(|| task.command.clone()), + env: Default::default(), + }) + .await?; + let setup_output = read_process_output(process, task.max_output_bytes).await?; + if task.setup_command.is_none() { + return Ok(CommandOutput { + sandbox_id, + setup: None, + main: setup_output, + error: None, + }); + } + if setup_output.exit_code != Some(0) { + return Ok(CommandOutput { + sandbox_id, + setup: Some(setup_output), + main: ProcessOutput::empty(), + error: Some("setup command exited non-zero".to_string()), + }); + } + let process = conversation_handle + .run_in_sandbox(RunInSandboxRequest { + id: sandbox_id.clone(), + command: task.command.clone(), + env: Default::default(), + }) + .await?; + let main_output = read_process_output(process, task.max_output_bytes).await?; + Ok(CommandOutput { + sandbox_id: sandbox_id.clone(), + setup: Some(setup_output), + main: main_output, + error: None, + }) + } + .await; + + let (exit_code, stdout, stderr, truncated, error, setup, sandbox_id) = match command_result { + Ok(output) => { + let truncated = + output.main.truncated || output.setup.as_ref().is_some_and(|setup| setup.truncated); + ( + output.main.exit_code, + output.main.stdout, + output.main.stderr, + truncated, + 
output.error, + output.setup, + Some(output.sandbox_id), + ) + } + Err(error) => ( + None, + Vec::new(), + Vec::new(), + false, + Some(error.to_string()), + None, + None, + ), + }; + + let artifact = ScheduledTaskArtifact { + task_id: task.id.clone(), + task_name: task.name.clone(), + run_id: run_id.to_string(), + sandbox_id, + setup_command: task.setup_command.clone(), + command: task.command.clone(), + exit_code, + setup_stdout: setup + .as_ref() + .map(|output| String::from_utf8_lossy(&output.stdout).into_owned()), + setup_stderr: setup + .as_ref() + .map(|output| String::from_utf8_lossy(&output.stderr).into_owned()), + stdout: String::from_utf8_lossy(&stdout).into_owned(), + stderr: String::from_utf8_lossy(&stderr).into_owned(), + truncated, + error: error.clone(), + }; + let artifact_version = conversation_handle + .write_artifact(WriteArtifactRequest { + path: format!("scheduled-tasks/{}/{run_id}.json", task.name), + contents: serde_json::to_vec_pretty(&artifact)?, + }) + .await?; + let artifact_id = artifact_version.artifact_id.to_string(); + let prompt = if let Some(error) = &error { + error_wakeup_prompt(task, run_id, error, &artifact_version) + } else { + wakeup_prompt( + task, + run_id, + exit_code.expect("completed scheduled command has exit code"), + truncated, + &artifact_version, + &stdout, + &stderr, + ) + }; + send_conversation_wakeup(conversation.as_ref(), prompt).await?; + + Ok(TaskOutput { + exit_code, + stdout, + stderr, + truncated, + result_artifact_id: Some(artifact_id), + error, + }) +} + +async fn resolve_task_sandbox( + task: &mut ScheduledTaskRecord, + conversation: &dyn exoharness::ConversationHandle, + agent_config: &crate::AgentConfig, + conversation_config: &crate::ConversationConfig, +) -> Result { + match task.sandbox_mode { + ScheduledTaskSandboxMode::Conversation => { + ensure_conversation_sandbox(conversation, agent_config, conversation_config).await + } + ScheduledTaskSandboxMode::TaskFresh => { + if let Some(sandbox_id) = 
&task.task_sandbox_id { + return Ok(sandbox_id.clone()); + } + let sandbox_id = + create_conversation_sandbox(conversation, agent_config, conversation_config) + .await?; + task.task_sandbox_id = Some(sandbox_id.clone()); + Ok(sandbox_id) + } + } +} + +async fn read_process_output( + process: Box, + max_stream_bytes: u64, +) -> Result { + let parts = process.into_parts(); + drop(parts.stdin); + + let (stdout_result, stderr_result, exit_result) = tokio::join!( + read_limited(parts.stdout, max_stream_bytes), + read_limited(parts.stderr, max_stream_bytes), + parts.wait, + ); + let (stdout, stdout_truncated) = stdout_result?; + let (stderr, stderr_truncated) = stderr_result?; + let exit_code = exit_result?; + let truncated = stdout_truncated || stderr_truncated; + Ok(ProcessOutput { + exit_code: Some(exit_code), + stdout, + stderr, + truncated, + }) +} + +struct CommandOutput { + sandbox_id: String, + setup: Option, + main: ProcessOutput, + error: Option, +} + +struct ProcessOutput { + exit_code: Option, + stdout: Vec, + stderr: Vec, + truncated: bool, +} + +impl ProcessOutput { + fn empty() -> Self { + Self { + exit_code: None, + stdout: Vec::new(), + stderr: Vec::new(), + truncated: false, + } + } +} + +async fn read_limited(mut reader: R, max_bytes: u64) -> Result<(Vec, bool)> +where + R: AsyncRead + Unpin, +{ + let mut output = Vec::new(); + let mut truncated = false; + let mut buffer = [0u8; 8192]; + loop { + let read = reader.read(&mut buffer).await?; + if read == 0 { + break; + } + let remaining = max_bytes.saturating_sub(output.len() as u64) as usize; + if read > remaining { + output.extend_from_slice(&buffer[..remaining]); + truncated = true; + continue; + } + output.extend_from_slice(&buffer[..read]); + } + Ok((output, truncated)) +} + +fn wakeup_prompt( + task: &ScheduledTaskRecord, + run_id: &str, + exit_code: i32, + truncated: bool, + artifact: &exoharness::ArtifactVersion, + stdout: &[u8], + stderr: &[u8], +) -> String { + format!( + "Scheduled task `{}` 
completed.\n\nRun id: `{}`\nExit code: {}\nResult artifact: `{}` version {} at `{}`\nOutput truncated: {}\n\nReport instructions:\n{}\n\nstdout preview:\n{}\n\nstderr preview:\n{}", + task.name, + run_id, + exit_code, + artifact.artifact_id, + artifact.version, + artifact.path, + truncated, + task.report_prompt, + preview(stdout), + preview(stderr), + ) +} + +fn error_wakeup_prompt( + task: &ScheduledTaskRecord, + run_id: &str, + error: &str, + artifact: &exoharness::ArtifactVersion, +) -> String { + format!( + "Scheduled task `{}` failed.\n\nRun id: `{}`\nResult artifact: `{}` version {} at `{}`\n\nReport instructions:\n{}\n\nError:\n{}", + task.name, + run_id, + artifact.artifact_id, + artifact.version, + artifact.path, + task.report_prompt, + error, + ) +} + +fn preview(bytes: &[u8]) -> String { + const PREVIEW_BYTES: usize = 4_000; + let end = bytes.len().min(PREVIEW_BYTES); + String::from_utf8_lossy(&bytes[..end]).into_owned() +} diff --git a/crates/executor/src/scheduler_store.rs b/crates/executor/src/scheduler_store.rs new file mode 100644 index 0000000..6a2d3ca --- /dev/null +++ b/crates/executor/src/scheduler_store.rs @@ -0,0 +1,231 @@ +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use tokio::fs; + +use crate::scheduler_types::{ + NewScheduledTask, ScheduledTaskRecord, ScheduledTaskRunRecord, now_ms, +}; + +#[derive(Debug, Clone)] +pub struct SchedulerStore { + root: PathBuf, +} + +impl SchedulerStore { + pub fn new(root: impl Into) -> Self { + Self { root: root.into() } + } + + pub fn root(&self) -> &Path { + &self.root + } + + pub async fn create_task(&self, request: NewScheduledTask) -> Result { + let task = ScheduledTaskRecord::new(request, now_ms())?; + self.put_task(&task).await?; + Ok(task) + } + + pub async fn list_tasks(&self) -> Result> { + let task_dir = self.tasks_dir(); + match fs::metadata(&task_dir).await { + Ok(_) => {} + Err(error) if error.kind() == std::io::ErrorKind::NotFound => { + return Ok(Vec::new()); + } + 
Err(error) => return Err(error.into()), + } + let mut entries = fs::read_dir(&task_dir) + .await + .with_context(|| format!("failed to read scheduled task directory {task_dir:?}"))?; + let mut tasks = Vec::new(); + while let Some(entry) = entries.next_entry().await? { + let path = entry.path(); + if path.extension().and_then(|ext| ext.to_str()) != Some("json") { + continue; + } + let bytes = fs::read(&path) + .await + .with_context(|| format!("failed to read scheduled task {}", path.display()))?; + tasks.push(serde_json::from_slice::(&bytes)?); + } + tasks.sort_by(|left, right| left.name.cmp(&right.name).then(left.id.cmp(&right.id))); + Ok(tasks) + } + + pub async fn list_tasks_for_conversation( + &self, + agent_id: &str, + conversation_id: &str, + include_disabled: bool, + ) -> Result> { + Ok(self + .list_tasks() + .await? + .into_iter() + .filter(|task| task.agent_id == agent_id && task.conversation_id == conversation_id) + .filter(|task| include_disabled || task.enabled) + .collect()) + } + + pub async fn due_tasks(&self, now_ms: u64) -> Result> { + Ok(self + .list_tasks() + .await? + .into_iter() + .filter(|task| task.is_due(now_ms)) + .collect()) + } + + pub async fn get_task(&self, task_id: &str) -> Result> { + let path = self.task_path(task_id); + match fs::read(&path).await { + Ok(bytes) => Ok(Some(serde_json::from_slice::(&bytes)?)), + Err(error) if error.kind() == std::io::ErrorKind::NotFound => Ok(None), + Err(error) => Err(error) + .with_context(|| format!("failed to read scheduled task {}", path.display())), + } + } + + pub async fn put_task(&self, task: &ScheduledTaskRecord) -> Result<()> { + fs::create_dir_all(self.tasks_dir()).await?; + let path = self.task_path(&task.id); + fs::write(&path, serde_json::to_vec_pretty(task)?) + .await + .with_context(|| format!("failed to write scheduled task {}", path.display())) + } + + pub async fn disable_task(&self, task_id: &str) -> Result> { + let Some(mut task) = self.get_task(task_id).await? 
else { + return Ok(None); + }; + task.enabled = false; + task.updated_at_ms = now_ms(); + self.put_task(&task).await?; + Ok(Some(task)) + } + + pub async fn delete_task(&self, task_id: &str) -> Result> { + let Some(task) = self.get_task(task_id).await? else { + return Ok(None); + }; + remove_file_if_exists(self.task_path(task_id)).await?; + remove_dir_if_exists(self.runs_dir(task_id)).await?; + Ok(Some(task)) + } + + pub async fn put_run(&self, run: &ScheduledTaskRunRecord) -> Result<()> { + fs::create_dir_all(self.runs_dir(&run.task_id)).await?; + let path = self.run_path(&run.task_id, &run.id); + fs::write(&path, serde_json::to_vec_pretty(run)?) + .await + .with_context(|| format!("failed to write scheduled task run {}", path.display())) + } + + fn tasks_dir(&self) -> PathBuf { + self.root.join("tasks") + } + + fn task_path(&self, task_id: &str) -> PathBuf { + self.tasks_dir().join(format!("{task_id}.json")) + } + + fn runs_dir(&self, task_id: &str) -> PathBuf { + self.root.join("runs").join(task_id) + } + + fn run_path(&self, task_id: &str, run_id: &str) -> PathBuf { + self.runs_dir(task_id).join(format!("{run_id}.json")) + } +} + +async fn remove_file_if_exists(path: PathBuf) -> Result<()> { + match fs::remove_file(&path).await { + Ok(()) => Ok(()), + Err(error) if error.kind() == std::io::ErrorKind::NotFound => Ok(()), + Err(error) => { + Err(error).with_context(|| format!("failed to delete file {}", path.display())) + } + } +} + +async fn remove_dir_if_exists(path: PathBuf) -> Result<()> { + match fs::remove_dir_all(&path).await { + Ok(()) => Ok(()), + Err(error) if error.kind() == std::io::ErrorKind::NotFound => Ok(()), + Err(error) => { + Err(error).with_context(|| format!("failed to delete directory {}", path.display())) + } + } +} + +#[cfg(test)] +mod tests { + use tempfile::TempDir; + + use super::*; + + #[tokio::test] + async fn creates_and_lists_tasks() { + let tempdir = TempDir::new().unwrap(); + let store = SchedulerStore::new(tempdir.path()); + let 
task = store + .create_task(NewScheduledTask { + agent_id: "agent".to_string(), + conversation_id: "conversation".to_string(), + name: "check".to_string(), + schedule: "@every 1m".to_string(), + sandbox_mode: None, + setup_command: None, + command: vec!["true".to_string()], + report_prompt: "Report.".to_string(), + max_output_bytes: None, + }) + .await + .unwrap(); + + assert_eq!(store.list_tasks().await.unwrap(), vec![task]); + } + + #[tokio::test] + async fn disables_and_deletes_tasks() { + let tempdir = TempDir::new().unwrap(); + let store = SchedulerStore::new(tempdir.path()); + let task = store + .create_task(NewScheduledTask { + agent_id: "agent".to_string(), + conversation_id: "conversation".to_string(), + name: "check".to_string(), + schedule: "@every 1m".to_string(), + sandbox_mode: None, + setup_command: None, + command: vec!["true".to_string()], + report_prompt: "Report.".to_string(), + max_output_bytes: None, + }) + .await + .unwrap(); + + store.disable_task(&task.id).await.unwrap(); + assert!( + store + .list_tasks_for_conversation("agent", "conversation", false) + .await + .unwrap() + .is_empty() + ); + assert_eq!( + store + .list_tasks_for_conversation("agent", "conversation", true) + .await + .unwrap() + .len(), + 1 + ); + + let deleted = store.delete_task(&task.id).await.unwrap().unwrap(); + assert_eq!(deleted.id, task.id); + assert!(store.get_task(&task.id).await.unwrap().is_none()); + } +} diff --git a/crates/executor/src/scheduler_types.rs b/crates/executor/src/scheduler_types.rs new file mode 100644 index 0000000..7be99eb --- /dev/null +++ b/crates/executor/src/scheduler_types.rs @@ -0,0 +1,291 @@ +use std::time::{SystemTime, UNIX_EPOCH}; + +use anyhow::{Result, anyhow, bail}; +use exoharness::Uuid7; +use serde::{Deserialize, Serialize}; + +pub const DEFAULT_MAX_OUTPUT_BYTES: u64 = 200_000; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct ScheduledTaskRecord { + pub id: String, + pub agent_id: String, + pub 
conversation_id: String, + pub name: String, + pub schedule: String, + #[serde(default)] + pub sandbox_mode: ScheduledTaskSandboxMode, + #[serde(default)] + pub task_sandbox_id: Option, + #[serde(default)] + pub setup_command: Option>, + pub command: Vec, + pub report_prompt: String, + pub max_output_bytes: u64, + pub enabled: bool, + pub created_at_ms: u64, + pub updated_at_ms: u64, + pub next_run_at_ms: u64, + pub last_run_at_ms: Option, + pub latest_run_id: Option, + pub latest_result_artifact_id: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct NewScheduledTask { + pub agent_id: String, + pub conversation_id: String, + pub name: String, + pub schedule: String, + pub sandbox_mode: Option, + pub setup_command: Option>, + pub command: Vec, + pub report_prompt: String, + pub max_output_bytes: Option, +} + +#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum ScheduledTaskSandboxMode { + #[default] + Conversation, + TaskFresh, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct ScheduledTaskRunRecord { + pub id: String, + pub task_id: String, + pub started_at_ms: u64, + pub finished_at_ms: u64, + pub exit_code: Option, + pub stdout_bytes: u64, + pub stderr_bytes: u64, + pub truncated: bool, + pub result_artifact_id: Option, + pub error: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ParsedSchedule { + interval_ms: u64, +} + +impl ScheduledTaskRecord { + pub fn new(request: NewScheduledTask, now_ms: u64) -> Result { + validate_task_name(&request.name)?; + validate_command(&request.command)?; + if let Some(setup_command) = &request.setup_command { + validate_command(setup_command)?; + } + let schedule = parse_schedule(&request.schedule)?; + let max_output_bytes = request.max_output_bytes.unwrap_or(DEFAULT_MAX_OUTPUT_BYTES); + if max_output_bytes == 0 { + bail!("scheduled task maxOutputBytes must be greater 
than zero"); + } + Ok(Self { + id: Uuid7::now().to_string(), + agent_id: non_empty("agentId", request.agent_id)?, + conversation_id: non_empty("conversationId", request.conversation_id)?, + name: request.name, + schedule: request.schedule, + sandbox_mode: request.sandbox_mode.unwrap_or_default(), + task_sandbox_id: None, + setup_command: request.setup_command, + command: request.command, + report_prompt: non_empty("reportPrompt", request.report_prompt)?, + max_output_bytes, + enabled: true, + created_at_ms: now_ms, + updated_at_ms: now_ms, + next_run_at_ms: schedule.next_after_ms(now_ms), + last_run_at_ms: None, + latest_run_id: None, + latest_result_artifact_id: None, + }) + } + + pub fn is_due(&self, now_ms: u64) -> bool { + self.enabled && self.next_run_at_ms <= now_ms + } + + pub fn mark_completed( + &mut self, + run: &ScheduledTaskRunRecord, + result_artifact_id: Option, + now_ms: u64, + ) -> Result<()> { + let schedule = parse_schedule(&self.schedule)?; + self.updated_at_ms = now_ms; + self.last_run_at_ms = Some(run.finished_at_ms); + self.latest_run_id = Some(run.id.clone()); + self.latest_result_artifact_id = result_artifact_id; + self.next_run_at_ms = schedule.next_after_ms(now_ms); + Ok(()) + } +} + +impl ParsedSchedule { + pub fn next_after_ms(&self, now_ms: u64) -> u64 { + now_ms.saturating_add(self.interval_ms) + } +} + +pub fn now_ms() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system clock before unix epoch") + .as_millis() as u64 +} + +pub fn parse_schedule(raw: &str) -> Result { + let schedule = raw.trim(); + if let Some(interval) = schedule.strip_prefix("@every ") { + return parse_interval(interval.trim()); + } + + let parts = schedule.split_whitespace().collect::>(); + if parts.len() == 5 + && parts[0].starts_with("*/") + && parts[1] == "*" + && parts[2] == "*" + && parts[3] == "*" + && parts[4] == "*" + { + let minutes = parts[0] + .trim_start_matches("*/") + .parse::() + .map_err(|_| anyhow!("cron minute interval must 
be a positive integer"))?; + if minutes == 0 { + bail!("cron minute interval must be greater than zero"); + } + return Ok(ParsedSchedule { + interval_ms: minutes.saturating_mul(60_000), + }); + } + + bail!("schedule must be '@every ' or '*/N * * * *'"); +} + +fn parse_interval(raw: &str) -> Result { + if raw.len() < 2 { + bail!("interval must include a value and unit"); + } + let (value, unit) = raw.split_at(raw.len() - 1); + let value = value + .parse::() + .map_err(|_| anyhow!("interval value must be a positive integer"))?; + if value == 0 { + bail!("interval value must be greater than zero"); + } + let multiplier = match unit { + "s" => 1_000, + "m" => 60_000, + "h" => 3_600_000, + "d" => 86_400_000, + _ => bail!("interval unit must be one of s, m, h, or d"), + }; + Ok(ParsedSchedule { + interval_ms: value.saturating_mul(multiplier), + }) +} + +fn validate_task_name(name: &str) -> Result<()> { + if name.is_empty() { + bail!("scheduled task name must not be empty"); + } + if !name + .chars() + .all(|ch| ch.is_ascii_alphanumeric() || ch == '-' || ch == '_') + { + bail!("scheduled task name may only contain letters, numbers, '-' and '_'"); + } + Ok(()) +} + +fn validate_command(command: &[String]) -> Result<()> { + if command.is_empty() { + bail!("scheduled task command must not be empty"); + } + if command.iter().any(|part| part.is_empty()) { + bail!("scheduled task command entries must not be empty"); + } + Ok(()) +} + +fn non_empty(field: &str, value: String) -> Result { + if value.trim().is_empty() { + bail!("{field} must not be empty"); + } + Ok(value) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parses_every_interval() { + assert_eq!( + parse_schedule("@every 5m").unwrap().next_after_ms(1), + 300_001 + ); + } + + #[test] + fn parses_simple_cron_interval() { + assert_eq!( + parse_schedule("*/30 * * * *").unwrap().next_after_ms(1), + 1_800_001 + ); + } + + #[test] + fn rejects_invalid_schedule() { + assert!(parse_schedule("* * * * 
*").is_err()); + } + + #[test] + fn scheduled_task_defaults_to_conversation_sandbox() { + let task = ScheduledTaskRecord::new( + NewScheduledTask { + agent_id: "agent".to_string(), + conversation_id: "conversation".to_string(), + name: "check".to_string(), + schedule: "@every 1m".to_string(), + sandbox_mode: None, + setup_command: None, + command: vec!["true".to_string()], + report_prompt: "Report.".to_string(), + max_output_bytes: None, + }, + 1, + ) + .unwrap(); + + assert_eq!(task.sandbox_mode, ScheduledTaskSandboxMode::Conversation); + assert_eq!(task.task_sandbox_id, None); + } + + #[test] + fn scheduled_task_accepts_fresh_task_sandbox_mode() { + let task = ScheduledTaskRecord::new( + NewScheduledTask { + agent_id: "agent".to_string(), + conversation_id: "conversation".to_string(), + name: "check".to_string(), + schedule: "@every 1m".to_string(), + sandbox_mode: Some(ScheduledTaskSandboxMode::TaskFresh), + setup_command: None, + command: vec!["true".to_string()], + report_prompt: "Report.".to_string(), + max_output_bytes: None, + }, + 1, + ) + .unwrap(); + + assert_eq!(task.sandbox_mode, ScheduledTaskSandboxMode::TaskFresh); + assert_eq!(task.task_sandbox_id, None); + } +} diff --git a/crates/executor/src/typescript.rs b/crates/executor/src/typescript.rs index 0bf3432..f075080 100644 --- a/crates/executor/src/typescript.rs +++ b/crates/executor/src/typescript.rs @@ -23,10 +23,11 @@ use tokio::process::{Child, ChildStdout, Command}; use tokio::sync::{Mutex, mpsc}; use tokio::task::JoinHandle; +use crate::conversation_sandbox::ensure_conversation_sandbox; use crate::execution_tracing::TurnExecutionTrace; use crate::harness_executor::{ExecutorHarnessRuntime, ExecutorStreamMode, HarnessExecutor}; use crate::harness_facade::{SharedHarness, SharedHarnessBacked}; -use crate::harness_tool::{BasicToolRuntime, ensure_shell_sandbox}; +use crate::harness_tool::{BasicToolRuntime, ExoclawToolRuntime}; use crate::shared::try_send_stream_event; use crate::{ AgentConfig, 
BraintrustRuntimeConfig, ConversationConfig, ExecutionStreamEvent, SendRequest, @@ -496,7 +497,7 @@ impl TypeScriptRunnerProcess { env: HashMap, ) -> Result { let sandbox_id = - ensure_shell_sandbox(conversation, agent_config, conversation_config).await?; + ensure_conversation_sandbox(conversation, agent_config, conversation_config).await?; let process = conversation .run_in_sandbox(RunInSandboxRequest { id: sandbox_id, @@ -607,6 +608,30 @@ impl TypeScriptHarness { } } +impl TypeScriptHarness { + pub async fn exoclaw_from_root( + root: impl AsRef, + runtime_config: Option, + env: HashMap, + ) -> Result { + let workspace_root = std::env::current_dir() + .context("failed to resolve current directory for Exoclaw harness")?; + let root = root.as_ref(); + let exoharness: Arc = + Arc::new(BasicExoHarness::new(root.join("exoharness")).await?); + let tools = Arc::new(ExoclawToolRuntime::with_scheduler_root( + root.join("scheduled-tasks"), + )); + let runtime = ExecutorHarnessRuntime::new( + TypeScriptExecutor::new(Arc::clone(&exoharness), workspace_root, env, tools), + runtime_config, + ); + Ok(Self { + inner: SharedHarness::new(exoharness, runtime), + }) + } +} + impl SharedHarnessBacked for TypeScriptHarness where T: ToolRuntime + 'static, diff --git a/crates/exoharness/src/basic.rs b/crates/exoharness/src/basic.rs index c0b5b8c..bcad2e3 100644 --- a/crates/exoharness/src/basic.rs +++ b/crates/exoharness/src/basic.rs @@ -1158,15 +1158,28 @@ impl ConversationHandle for BasicConversationHandle { if request.command.is_empty() { bail!("sandbox command must not be empty"); } - let sandbox_handle = self - .harness - .inner - .running_sandboxes - .lock() - .await - .get(&request.id) - .cloned() - .ok_or_else(|| anyhow!("sandbox is not active in this process: {}", request.id))?; + let sandbox_handle = { + let running_sandboxes = self.harness.inner.running_sandboxes.lock().await; + running_sandboxes.get(&request.id).cloned() + }; + let sandbox_handle = match sandbox_handle { + 
Some(sandbox_handle) => sandbox_handle, + None => { + let sandbox_handle = self + .harness + .inner + .sandbox_backend + .acquire(sandbox_request(self.record.id, &request.id, &sandbox)) + .await?; + self.harness + .inner + .running_sandboxes + .lock() + .await + .insert(request.id.clone(), sandbox_handle.clone()); + sandbox_handle + } + }; let parts = sandbox_handle .start_process(&SandboxCommand { argv: request.command.clone(), diff --git a/crates/exoharness/src/basic_tests.rs b/crates/exoharness/src/basic_tests.rs index 6a614c3..2813501 100644 --- a/crates/exoharness/src/basic_tests.rs +++ b/crates/exoharness/src/basic_tests.rs @@ -481,6 +481,85 @@ async fn basic_backend_runs_commands_in_created_sandbox() { assert_eq!(wait_result.expect("process should exit"), 0); } +#[tokio::test(flavor = "current_thread")] +async fn basic_backend_reattaches_running_sandbox_in_new_harness_process() { + let tempdir = TempDir::new().expect("tempdir"); + let harness = BasicExoHarness::new_with_local_process_sandbox(tempdir.path()) + .await + .expect("harness should initialize"); + let agent = harness + .new_agent(NewAgentRequest { + slug: "agent".to_string(), + name: "Agent".to_string(), + }) + .await + .expect("agent"); + let conversation = agent + .new_conversation(NewConversationRequest::default()) + .await + .expect("conversation"); + let agent_id = agent.record().id; + let conversation_id = conversation.record().id; + + let sandbox_id = conversation + .create_sandbox(CreateSandboxRequest { + image: "basic-local-process".to_string(), + default_workdir: Some(tempdir.path().display().to_string()), + file_system_mounts: None, + enable_networking: Some(true), + idle_seconds: Some(60), + }) + .await + .expect("sandbox should be created"); + drop(conversation); + drop(agent); + drop(harness); + + let reloaded_harness = BasicExoHarness::new_with_local_process_sandbox(tempdir.path()) + .await + .expect("harness should reload"); + let reloaded_agent = reloaded_harness + 
.get_agent(&agent_id) + .await + .expect("get agent") + .expect("agent exists"); + let reloaded_conversation = reloaded_agent + .get_conversation(&conversation_id) + .await + .expect("get conversation") + .expect("conversation exists"); + + let process = reloaded_conversation + .run_in_sandbox(RunInSandboxRequest { + id: sandbox_id, + command: vec![ + "/bin/sh".to_string(), + "-lc".to_string(), + "printf reattached".to_string(), + ], + env: Default::default(), + }) + .await + .expect("sandbox command should run after reload"); + let parts = process.into_parts(); + let mut stdout = parts.stdout; + let mut stderr = parts.stderr; + drop(parts.stdin); + let mut stdout_bytes = Vec::new(); + let mut stderr_bytes = Vec::new(); + let (stdout_result, stderr_result, wait_result) = tokio::join!( + stdout.read_to_end(&mut stdout_bytes), + stderr.read_to_end(&mut stderr_bytes), + parts.wait, + ); + + stdout_result.expect("stdout should read"); + stderr_result.expect("stderr should read"); + assert_eq!(String::from_utf8_lossy(&stdout_bytes), "reattached"); + assert_eq!(String::from_utf8_lossy(&stderr_bytes), ""); + assert_eq!(wait_result.expect("process should exit"), 0); +} + fn user_message(text: &str) -> Message { Message::User { content: UserContent::String(text.to_string()), diff --git a/crates/exoharness/src/sandbox.rs b/crates/exoharness/src/sandbox.rs index 64324e1..957f3e8 100644 --- a/crates/exoharness/src/sandbox.rs +++ b/crates/exoharness/src/sandbox.rs @@ -129,6 +129,7 @@ struct WarmSandboxEntry { name: String, request: SandboxRequest, last_used_at: Instant, + owned: bool, } #[derive(Debug, Deserialize)] @@ -262,7 +263,7 @@ impl AppleContainerSandboxBackend { .iter() .filter_map(|(key, entry)| { let ttl = entry.request.lifecycle.idle_ttl?; - (entry.last_used_at + ttl <= now).then(|| key.clone()) + (entry.owned && entry.last_used_at + ttl <= now).then(|| key.clone()) }) .collect::>(); @@ -273,7 +274,9 @@ impl AppleContainerSandboxBackend { }; for entry in expired { - 
cleanup_named_container(&self.container_bin, &entry.name).await?; + if entry.owned { + cleanup_named_container(&self.container_bin, &entry.name).await?; + } } Ok(()) @@ -293,7 +296,7 @@ impl Drop for AppleContainerSandboxBackend { }; let names = warm_sandboxes .drain() - .map(|(_, entry)| entry.name) + .filter_map(|(_, entry)| entry.owned.then_some(entry.name)) .collect::>(); for name in names { cleanup_named_container_blocking(&self.container_bin, &name); @@ -331,11 +334,19 @@ impl ManagedSandboxBackend for AppleContainerSandboxBackend { None => None, } }; - if let Some(entry) = replaced { + if let Some(entry) = replaced + && entry.owned + { schedule_cleanup_named_container(self.container_bin.clone(), entry.name); } - let name = create_unique_warm_sandbox(&self.container_bin, &request).await?; + let (name, owned) = match find_running_warm_sandbox(&self.container_bin, &request).await? { + Some(name) => (name, false), + None => ( + create_unique_warm_sandbox(&self.container_bin, &request).await?, + true, + ), + }; { let mut warm_sandboxes = self.warm_sandboxes.lock().await; @@ -345,6 +356,7 @@ impl ManagedSandboxBackend for AppleContainerSandboxBackend { name: name.clone(), request: request.clone(), last_used_at: Instant::now(), + owned, }, ); } @@ -432,7 +444,9 @@ impl ManagedSandboxHandle for AppleWarmSandboxHandle { warm_sandboxes.remove(&self.request.key) }; - if let Some(entry) = removed { + if let Some(entry) = removed + && entry.owned + { cleanup_named_container(&self.container_bin, &entry.name).await } else { Ok(()) @@ -601,6 +615,40 @@ async fn create_unique_warm_sandbox( )) } +async fn find_running_warm_sandbox( + container_bin: &Path, + request: &SandboxRequest, +) -> Result> { + let output = run_container_admin_command( + container_bin, + WARM_SANDBOX_CLEANUP_TIMEOUT, + ["list", "--format", "json"], + ) + .await?; + if !output.status.success() { + return Err(anyhow!( + "failed to list warm sandboxes: {}", + String::from_utf8_lossy(&output.stderr).trim() 
+ )); + } + + let spec_hash = sandbox_spec_hash(&request.spec); + let containers: Vec = serde_json::from_slice(&output.stdout)?; + Ok(containers.into_iter().find_map(|container| { + if container.status.as_deref() != Some("running") { + return None; + } + let labels = &container.configuration.labels; + let key_matches = labels + .get(WARM_SANDBOX_KEY_LABEL) + .is_some_and(|value| value == &request.key.to_string()); + let spec_matches = labels + .get(WARM_SANDBOX_SPEC_HASH_LABEL) + .is_some_and(|value| value == &spec_hash); + (key_matches && spec_matches).then_some(container.configuration.id) + })) +} + async fn ensure_warm_sandbox_ready( container_bin: &Path, request: &SandboxRequest, @@ -615,35 +663,51 @@ async fn ensure_warm_sandbox_ready( }; let mut warm_sandboxes = warm_sandboxes.lock().await; - let current_name = match warm_sandboxes.get_mut(&request.key) { + let (current_name, current_owned) = match warm_sandboxes.get_mut(&request.key) { Some(entry) if entry.request.spec == request.spec => { entry.last_used_at = Instant::now(); - entry.name.clone() + (entry.name.clone(), entry.owned) } Some(_) => { let stale = warm_sandboxes .remove(&request.key) .expect("entry disappeared while locked"); - schedule_cleanup_named_container(container_bin.to_path_buf(), stale.name); - let name = create_unique_warm_sandbox(container_bin, request).await?; + if stale.owned { + schedule_cleanup_named_container(container_bin.to_path_buf(), stale.name); + } + let (name, owned) = match find_running_warm_sandbox(container_bin, request).await? 
{ + Some(name) => (name, false), + None => ( + create_unique_warm_sandbox(container_bin, request).await?, + true, + ), + }; warm_sandboxes.insert( request.key.clone(), WarmSandboxEntry { name: name.clone(), request: request.clone(), last_used_at: Instant::now(), + owned, }, ); return Ok(name); } None => { - let name = create_unique_warm_sandbox(container_bin, request).await?; + let (name, owned) = match find_running_warm_sandbox(container_bin, request).await? { + Some(name) => (name, false), + None => ( + create_unique_warm_sandbox(container_bin, request).await?, + true, + ), + }; warm_sandboxes.insert( request.key.clone(), WarmSandboxEntry { name: name.clone(), request: request.clone(), last_used_at: Instant::now(), + owned, }, ); return Ok(name); @@ -658,17 +722,26 @@ async fn ensure_warm_sandbox_ready( return Ok(current_name); } - let replacement_name = create_unique_warm_sandbox(container_bin, request).await?; + let (replacement_name, owned) = match find_running_warm_sandbox(container_bin, request).await? 
{ + Some(name) => (name, false), + None => ( + create_unique_warm_sandbox(container_bin, request).await?, + true, + ), + }; warm_sandboxes.insert( request.key.clone(), WarmSandboxEntry { name: replacement_name.clone(), request: request.clone(), last_used_at: Instant::now(), + owned, }, ); drop(warm_sandboxes); - schedule_cleanup_named_container(container_bin.to_path_buf(), current_name); + if current_owned { + schedule_cleanup_named_container(container_bin.to_path_buf(), current_name); + } Ok(replacement_name) } @@ -972,14 +1045,10 @@ async fn reap_orphaned_warm_sandboxes(container_bin: &Path) -> Result<()> { if now - started_date < ORPHANED_WARM_SANDBOX_MIN_AGE.as_secs_f64() { continue; } - if let Err(error) = - cleanup_named_container(container_bin, &container.configuration.id).await - { - eprintln!( - "failed to clean up orphaned warm sandbox {}: {error}", - container.configuration.id - ); - } + schedule_cleanup_named_container_silent( + container_bin.to_path_buf(), + container.configuration.id, + ); } Ok(()) @@ -1018,6 +1087,12 @@ fn schedule_cleanup_named_container(container_bin: PathBuf, name: String) { }); } +fn schedule_cleanup_named_container_silent(container_bin: PathBuf, name: String) { + tokio::spawn(async move { + let _ = cleanup_named_container(&container_bin, &name).await; + }); +} + async fn run_container_admin_command( container_bin: &Path, timeout: Duration, diff --git a/examples/exoclaw/README.md b/examples/exoclaw/README.md new file mode 100644 index 0000000..14bebed --- /dev/null +++ b/examples/exoclaw/README.md @@ -0,0 +1,62 @@ +# Exoclaw Harness + +Exoclaw is the long-running agent harness example. It builds on the minimal +TypeScript harness turn loop, but opts into heavier runtime features: + +- scheduled sandbox tasks +- live conversation wake-ups +- sticky conversation sandbox policy +- optional `sandboxMode: "task_fresh"` task-owned sandboxes + +Use Exoclaw when the agent should keep working over time. 
Use +`examples/typescript/basic-harness.ts` for a minimal TypeScript harness without +scheduler tools. + +## Tools + +Exoclaw includes the normal minimal tools: + +- `shell` +- `install_agent_tool` when agent tool creation is enabled +- configured library tools + +It also adds scheduler tools: + +- `schedule_sandbox_task` +- `list_scheduled_tasks` +- `cancel_scheduled_task` +- `delete_scheduled_task` + +`cancel_scheduled_task` disables a task and preserves its record/history. +`delete_scheduled_task` removes the task record and stored run history. + +## Sandbox Modes + +Scheduled tasks default to `sandboxMode: "conversation"`. This uses the sticky +conversation sandbox, so packages installed through the REPL, such as `curl` or +`python3`, are available to scheduled task runs while that warm sandbox is still +alive. + +Important limitation: the current sandbox filesystem is not durable across warm +container death. Exoclaw stores a durable conversation sandbox record, but package +installs made interactively live in the running warm container. If the REPL exits, +the host restarts, or the container backend cleans up the warm container, a later +scheduled task may recreate the sandbox from the base image and lose packages +installed with commands like `apt-get install python3`. + +For reliable scheduled tasks, prefer one of these: + +- Use a sandbox image that already contains required dependencies. +- Include a `setupCommand` that installs required packages before the task runs. +- Keep task code/data on mounted storage instead of relying on mutated container + filesystem state. + +Use `sandboxMode: "task_fresh"` when a task should have a separate fresh sandbox. +That sandbox starts from the configured image and mounts. It is reused across the +task's runs and stopped when the task is cancelled. + +The right long-term scope is still open. 
Conversation-scoped sandboxes are useful +for making one conversation's setup visible to its scheduled tasks, but +agent-scoped sandboxes may be more intuitive for long-running agents that manage +multiple conversations. This should likely become configurable, with an explicit +durability model rather than relying on warm container lifetime. diff --git a/examples/exoclaw/harness.ts b/examples/exoclaw/harness.ts new file mode 100644 index 0000000..a3e77cd --- /dev/null +++ b/examples/exoclaw/harness.ts @@ -0,0 +1,55 @@ +import { + defineHarness, + registerBuiltInTools, + registerLibraryToolsFromManifest, + registerAgentToolsFromManifestPathIfExists, + registerSchedulerTools, + type BuiltInToolName, + type HarnessToolRegistry, + type Message, + type TurnContext, +} from "@exo/harness"; + +import { + basicHarnessInstructions, + defaultBuiltInToolNames, + runResponsesHarnessTurn, +} from "../typescript/turn-loop"; + +export default defineHarness({ + async runTurn(context) { + await runResponsesHarnessTurn(context, { + instructions: exoclawInstructions, + registerTools: registerExoclawTools, + }); + }, +}); + +async function registerExoclawTools( + tools: HarnessToolRegistry, + context: TurnContext, +): Promise { + registerBuiltInTools(tools, context, builtInToolNames(context)); + registerSchedulerTools(tools); + await registerLibraryToolsFromManifest(tools, context, { + tools: context.agentConfig.libraryTools, + }); + if (context.agentConfig.enableAgentToolCreation) { + await registerAgentToolsFromManifestPathIfExists(tools, context); + } +} + +function builtInToolNames(context: TurnContext): BuiltInToolName[] { + return defaultBuiltInToolNames(context); +} + +function exoclawInstructions(context: TurnContext): Message[] { + return [ + ...basicHarnessInstructions(context), + { + role: "developer", + content: + 'This is the Exoclaw long-running agent harness. 
You can schedule recurring sandbox work with schedule_sandbox_task, inspect active tasks with list_scheduled_tasks, cancel tasks with cancel_scheduled_task, and permanently delete tasks with delete_scheduled_task. Use cancel_scheduled_task when task history should be preserved; use delete_scheduled_task when the user asks to remove a task entirely. Scheduled tasks default to sandboxMode: "conversation", which uses this conversation\'s sticky sandbox and can reuse tools installed through the REPL. Use sandboxMode: "task_fresh" only when the task should have a separate fresh sandbox that is reused across that task\'s runs.', + }, + ]; +} diff --git a/examples/typescript/basic-harness.ts b/examples/typescript/basic-harness.ts index c37a75a..d6e7f63 100644 --- a/examples/typescript/basic-harness.ts +++ b/examples/typescript/basic-harness.ts @@ -1,157 +1,9 @@ -import { - createToolRegistry, - defineHarness, - materializePromptMessages, - registerBuiltInTools, - registerAgentToolsFromManifestPathIfExists, - registerLibraryToolsFromManifest, - turnMetadata, - type EventData, - type Message, - type TurnContext, -} from "@exo/harness"; -import { - responseToLinguaEvents, - responseToolCalls, - ResponsesRuntime, - type NativeResponsesRequest, - type ResponsesRuntimeLike, - type TraceParent, -} from "@exo/model-runtime/responses"; +import { defineHarness } from "@exo/harness"; -import { resolveLlmBinding } from "./shared"; +import { runResponsesHarnessTurn } from "./turn-loop"; export default defineHarness({ async runTurn(context) { - const modelBinding = await resolveLlmBinding(context); - const runtime = ResponsesRuntime.fromModelBinding( - context.agentConfig, - modelBinding, - ); - await runtime.runTurn(context, (turnParent) => - runBasicTurnLoop(runtime, context, turnParent, modelBinding.model), - ); + await runResponsesHarnessTurn(context); }, }); - -async function runBasicTurnLoop( - runtime: ResponsesRuntimeLike, - context: TurnContext, - turnParent: TraceParent, - 
model: string, -): Promise { - const { conversation } = context.exoharness.current; - const maxToolRoundTrips = context.agentConfig.maxToolRoundTrips; - let latestEventId: string | null = null; - - for (let round = 0; ; round += 1) { - if ( - maxToolRoundTrips !== null && - maxToolRoundTrips !== undefined && - round > maxToolRoundTrips - ) { - return latestEventId; - } - - const tools = await createBasicToolRegistry(context); - const messages = await materializePromptMessages( - conversation, - basicHarnessInstructions(context), - ); - const request: NativeResponsesRequest = { - model, - messages, - tools: tools.definitions(), - maxOutputTokens: context.agentConfig.maxOutputTokens, - metadata: turnMetadata(context), - }; - - const response = context.streaming - ? await runtime.completeStream( - request, - { - onFirstChunk: (ttftMs) => context.stream.firstChunk(ttftMs), - onTextDelta: (text) => context.stream.text(text), - }, - { - parent: turnParent, - roundIndex: round, - }, - ) - : await runtime.complete(request, { - parent: turnParent, - roundIndex: round, - }); - - const events = responseToLinguaEvents(response); - if (events.length > 0) { - latestEventId = await appendTurnEvents(context, events); - } - - const toolCalls = responseToolCalls(response); - if (toolCalls.length === 0) { - return latestEventId; - } - - for (const toolCall of toolCalls) { - const toolResultEvents = await runtime.traceToolCall( - turnParent, - context, - toolCall, - round, - (toolCall) => tools.executePending([toolCall]), - ); - if (toolResultEvents.length > 0) { - latestEventId = await appendTurnEvents(context, toolResultEvents); - } - } - } -} - -async function appendTurnEvents( - context: TurnContext, - data: EventData[], -): Promise { - const { conversation, turn } = context.exoharness.current; - return ( - await conversation.addEvents({ - sessionId: turn.record.sessionId, - turnId: turn.record.id, - data, - }) - ).latestEventId; -} - -function basicHarnessInstructions(context: 
TurnContext): Message[] { - return context.agentConfig.enableAgentToolCreation - ? [...context.agentConfig.instructions, agentToolCreationInstruction()] - : context.agentConfig.instructions; -} - -function agentToolCreationInstruction(): Message { - return { - role: "developer", - content: - "Agent-created tools are supported. When the user asks you to create a reusable tool, call install_agent_tool with a complete TypeScript moduleSource. Do not claim the tool was created unless install_agent_tool returns ok: true. The moduleSource must default-export a Tool from @exo/harness using { definition, initializationParameters, initialize(...) }; definition.parameters must be a strict JSON schema object with additionalProperties: false; handlers must implement execute(args, execution), not invoke or call. Do not use zod, inputSchema, or external npm packages. After install_agent_tool succeeds, the new tool is available in the next model round of the same turn, so use it directly rather than falling back to shell.", - }; -} - -async function createBasicToolRegistry(context: TurnContext) { - const tools = createToolRegistry(context); - registerBuiltInTools(tools, context, builtInToolNames(context)); - await registerLibraryToolsFromManifest(tools, context, { - tools: context.agentConfig.libraryTools, - }); - if (context.agentConfig.enableAgentToolCreation) { - await registerAgentToolsFromManifestPathIfExists(tools, context); - } - return tools; -} - -function builtInToolNames( - context: TurnContext, -): Array<"shell" | "install_agent_tool"> { - return context.agentConfig.enableAgentToolCreation - ? 
["shell", "install_agent_tool"] - : ["shell"]; -} diff --git a/examples/typescript/turn-loop.ts b/examples/typescript/turn-loop.ts new file mode 100644 index 0000000..fd8e80f --- /dev/null +++ b/examples/typescript/turn-loop.ts @@ -0,0 +1,184 @@ +import { + createToolRegistry, + materializePromptMessages, + registerAgentToolsFromManifestPathIfExists, + registerBuiltInTools, + registerLibraryToolsFromManifest, + turnMetadata, + type BuiltInToolName, + type EventData, + type HarnessToolRegistry, + type Message, + type TurnContext, +} from "@exo/harness"; +import { + responseToLinguaEvents, + responseToolCalls, + ResponsesRuntime, + type NativeResponsesRequest, + type ResponsesRuntimeLike, + type TraceParent, +} from "@exo/model-runtime/responses"; + +import { resolveLlmBinding } from "./shared"; + +export interface ResponsesTurnLoopOptions { + instructions?: (context: TurnContext) => Message[]; + registerTools?: ( + tools: HarnessToolRegistry, + context: TurnContext, + ) => Promise | void; +} + +export async function runResponsesHarnessTurn( + context: TurnContext, + options: ResponsesTurnLoopOptions = {}, +): Promise { + const modelBinding = await resolveLlmBinding(context); + const runtime = ResponsesRuntime.fromModelBinding( + context.agentConfig, + modelBinding, + ); + await runtime.runTurn(context, (turnParent) => + runResponsesTurnLoop( + runtime, + context, + turnParent, + modelBinding.model, + options, + ), + ); +} + +export async function createDefaultToolRegistry( + context: TurnContext, + builtInToolNames: BuiltInToolName[] = defaultBuiltInToolNames(context), +): Promise { + const tools = createToolRegistry(context); + registerBuiltInTools(tools, context, builtInToolNames); + await registerLibraryToolsFromManifest(tools, context, { + tools: context.agentConfig.libraryTools, + }); + if (context.agentConfig.enableAgentToolCreation) { + await registerAgentToolsFromManifestPathIfExists(tools, context); + } + return tools; +} + +export function 
defaultBuiltInToolNames( + context: TurnContext, +): BuiltInToolName[] { + const names: BuiltInToolName[] = ["shell"]; + if (context.agentConfig.enableAgentToolCreation) { + names.push("install_agent_tool"); + } + return names; +} + +export function basicHarnessInstructions(context: TurnContext): Message[] { + return context.agentConfig.enableAgentToolCreation + ? [...context.agentConfig.instructions, agentToolCreationInstruction()] + : context.agentConfig.instructions; +} + +export function agentToolCreationInstruction(): Message { + return { + role: "developer", + content: + "Agent-created tools are supported. When the user asks you to create a reusable tool, call install_agent_tool with a complete TypeScript moduleSource. Do not claim the tool was created unless install_agent_tool returns ok: true. The moduleSource must default-export a Tool from @exo/harness using { definition, initializationParameters, initialize(...) }; definition.parameters must be a strict JSON schema object with additionalProperties: false; handlers must implement execute(args, execution), not invoke or call. Do not use zod, inputSchema, or external npm packages. After install_agent_tool succeeds, the new tool is available in the next model round of the same turn, so use it directly rather than falling back to shell.", + }; +} + +async function runResponsesTurnLoop( + runtime: ResponsesRuntimeLike, + context: TurnContext, + turnParent: TraceParent, + model: string, + options: ResponsesTurnLoopOptions, +): Promise { + const { conversation } = context.exoharness.current; + const maxToolRoundTrips = context.agentConfig.maxToolRoundTrips; + let latestEventId: string | null = null; + + for (let round = 0; ; round += 1) { + if ( + maxToolRoundTrips !== null && + maxToolRoundTrips !== undefined && + round > maxToolRoundTrips + ) { + return latestEventId; + } + + const tools = options.registerTools + ? 
createToolRegistry(context) + : await createDefaultToolRegistry(context); + if (options.registerTools) { + await options.registerTools(tools, context); + } + const messages = await materializePromptMessages( + conversation, + options.instructions?.(context) ?? basicHarnessInstructions(context), + ); + const request: NativeResponsesRequest = { + model, + messages, + tools: tools.definitions(), + maxOutputTokens: context.agentConfig.maxOutputTokens, + metadata: turnMetadata(context), + }; + + const response = context.streaming + ? await runtime.completeStream( + request, + { + onFirstChunk: (ttftMs) => context.stream.firstChunk(ttftMs), + onTextDelta: (text) => context.stream.text(text), + }, + { + parent: turnParent, + roundIndex: round, + }, + ) + : await runtime.complete(request, { + parent: turnParent, + roundIndex: round, + }); + + const events = responseToLinguaEvents(response); + if (events.length > 0) { + latestEventId = await appendTurnEvents(context, events); + } + + const toolCalls = responseToolCalls(response); + if (toolCalls.length === 0) { + return latestEventId; + } + + for (const toolCall of toolCalls) { + const toolResultEvents = await runtime.traceToolCall( + turnParent, + context, + toolCall, + round, + (toolCall) => tools.executePending([toolCall]), + ); + if (toolResultEvents.length > 0) { + latestEventId = await appendTurnEvents(context, toolResultEvents); + } + } + } +} + +async function appendTurnEvents( + context: TurnContext, + data: EventData[], +): Promise { + const { conversation, turn } = context.exoharness.current; + return ( + await conversation.addEvents({ + sessionId: turn.record.sessionId, + turnId: turn.record.id, + data, + }) + ).latestEventId; +} diff --git a/scripts/exoclaw-repl b/scripts/exoclaw-repl new file mode 100755 index 0000000..8892b12 --- /dev/null +++ b/scripts/exoclaw-repl @@ -0,0 +1,416 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" + +EXO_BIN="${EXO_BIN:-$ROOT_DIR/target/debug/exo}" +ENV_FILE="${EXO_ENV_FILE:-$ROOT_DIR/.env}" +MODEL="${EXO_MODEL:-gpt-5.4}" +AGENT="${EXO_AGENT:-exoclaw-agent}" +AGENT_NAME="${EXO_AGENT_NAME:-Exoclaw Agent}" +CONVERSATION="${EXO_CONVERSATION:-dev}" +CONVERSATION_NAME="${EXO_CONVERSATION_NAME:-Dev}" +MODULE="${EXO_MODULE:-examples/exoclaw/harness.ts}" +HARNESS="exoclaw" +SANDBOX_IMAGE="${EXO_SANDBOX_IMAGE:-ubuntu:24.04}" +NETWORKING="${EXO_NETWORKING:-enabled}" +SHELL_PROGRAM="${EXO_SHELL_PROGRAM:-/bin/bash}" +SCHEDULER_INTERVAL_SECONDS="${EXO_SCHEDULER_INTERVAL_SECONDS:-10}" +COMMAND="repl" +USE_SANDBOX=true +PULL_SANDBOX=false +START_SCHEDULER="${EXO_START_SCHEDULER:-true}" + +usage() { + cat <<'EOF' +Usage: + scripts/exoclaw-repl [options] + scripts/exoclaw-repl list + scripts/exoclaw-repl delall all + scripts/exoclaw-repl setup-sandbox + +Default behavior creates or reuses an Exoclaw agent and conversation, starts the +local scheduler loop, then starts a REPL. It reads .env by default if present. 
+ +Options: + --model Model binding name (default: gpt-5.4) + --agent Agent slug (default: exoclaw-agent) + --conversation Conversation slug (default: dev) + --convo Alias for --conversation + --agent-name Agent display name (default: Exoclaw Agent) + --conversation-name Conversation display name (default: Dev) + --module Exoclaw TypeScript harness module + --sandbox-image Sandbox image (default: ubuntu:24.04) + --networking enabled or disabled (default: enabled) + --shell-program Shell in the sandbox (default: /bin/bash) + --scheduler-interval Scheduler polling interval (default: 10) + --no-scheduler Do not start the local scheduled task runner + --scheduler Start the local scheduled task runner + --pull-sandbox Pull the sandbox image before starting + --no-sandbox Do not require or configure sandbox shell support + --env-file Env file to read if present (default: .env) + --exo-bin exo binary path (default: ./target/debug/exo) + --help Show this help + +Environment overrides: + EXO_MODEL, EXO_AGENT, EXO_CONVERSATION, EXO_AGENT_NAME, + EXO_CONVERSATION_NAME, EXO_MODULE, EXO_SANDBOX_IMAGE, + EXO_NETWORKING, EXO_SHELL_PROGRAM, EXO_ENV_FILE, EXO_BIN, + EXO_START_SCHEDULER, EXO_SCHEDULER_INTERVAL_SECONDS +EOF +} + +die() { + echo "error: $*" >&2 + exit 1 +} + +ensure_exo_bin() { + if [[ -x "$EXO_BIN" ]]; then + return + fi + if [[ "$EXO_BIN" != "$ROOT_DIR/target/debug/exo" ]]; then + die "exo binary is not executable: $EXO_BIN" + fi + echo "Building exo binary..." 
+ (cd "$ROOT_DIR" && CARGO_TARGET_DIR=target cargo build -p exo --ignore-rust-version) +} + +exo() { + "$EXO_BIN" --env-file-if-exists "$ENV_FILE" "$@" +} + +scheduler_pid_file() { + echo "$ROOT_DIR/.exo/exoclaw-scheduler.pid" +} + +scheduler_log_file() { + echo "$ROOT_DIR/.exo/exoclaw-scheduler.log" +} + +scheduler_process_running() { + local pid_file pid command_line + pid_file="$(scheduler_pid_file)" + [[ -f "$pid_file" ]] || return 1 + pid="$(<"$pid_file")" + [[ "$pid" =~ ^[0-9]+$ ]] || return 1 + kill -0 "$pid" >/dev/null 2>&1 || return 1 + if [[ "$EXO_BIN" -nt "$pid_file" ]]; then + echo "Restarting scheduler because $EXO_BIN is newer..." + kill "$pid" >/dev/null 2>&1 || true + return 1 + fi + command_line="$(ps -p "$pid" -o command= 2>/dev/null || true)" + [[ "$command_line" == *"schedule run --watch"* ]] +} + +ensure_scheduler() { + if [[ "$START_SCHEDULER" != true ]]; then + return + fi + if scheduler_process_running; then + return + fi + + mkdir -p "$ROOT_DIR/.exo" + local pid_file log_file + pid_file="$(scheduler_pid_file)" + log_file="$(scheduler_log_file)" + echo "Starting scheduler loop..." + nohup "$EXO_BIN" --env-file-if-exists "$ENV_FILE" --harness "$HARNESS" \ + schedule run --watch --interval-seconds "$SCHEDULER_INTERVAL_SECONDS" \ + >>"$log_file" 2>&1 & + echo "$!" >"$pid_file" + echo "Scheduler log: $log_file" +} + +container_image_exists() { + if command -v docker >/dev/null 2>&1; then + docker image inspect "$SANDBOX_IMAGE" >/dev/null 2>&1 + return + fi + if command -v podman >/dev/null 2>&1; then + podman image exists "$SANDBOX_IMAGE" >/dev/null 2>&1 + return + fi + return 2 +} + +container_pull_image() { + if command -v docker >/dev/null 2>&1; then + docker pull "$SANDBOX_IMAGE" + return + fi + if command -v podman >/dev/null 2>&1; then + podman pull "$SANDBOX_IMAGE" + return + fi + die "docker or podman is required to pre-pull sandbox images" +} + +preflight_sandbox_image() { + local status=0 + container_image_exists || status=$? 
+ case "$status" in + 0) + return + ;; + 1) + die "sandbox image $SANDBOX_IMAGE is not present; you have to either --pull-sandbox or use --no-sandbox" + ;; + 2) + die "docker/podman not found; you have to either install one or use --no-sandbox" + ;; + esac +} + +setup_sandbox() { + ensure_exo_bin + container_pull_image +} + +agent_exists() { + exo agent show "$AGENT" >/dev/null 2>&1 +} + +conversation_exists() { + exo conversation show "$AGENT" "$CONVERSATION" >/dev/null 2>&1 +} + +ensure_agent() { + if agent_exists; then + return + fi + + echo "Creating agent $AGENT..." + local args=( + --harness "$HARNESS" + agent create "$AGENT_NAME" + --slug "$AGENT" + --module "$MODULE" + --model "$MODEL" + ) + if [[ "$USE_SANDBOX" == true ]]; then + args+=(--sandbox-image "$SANDBOX_IMAGE" --networking "$NETWORKING") + fi + exo "${args[@]}" +} + +ensure_conversation() { + if conversation_exists; then + return + fi + + echo "Creating conversation $CONVERSATION..." + exo conversation create "$AGENT" "$CONVERSATION_NAME" \ + --slug "$CONVERSATION" + if [[ "$USE_SANDBOX" == true ]]; then + exo conversation update "$AGENT" "$CONVERSATION" \ + --shell-program "$SHELL_PROGRAM" >/dev/null + fi +} + +list_agents_and_conversations() { + ensure_exo_bin + echo "Agents and conversations:" + local agents + agents="$(exo agent list | awk 'NR > 1 { print $1 }')" + if [[ -z "$agents" ]]; then + echo " none" + return + fi + + while IFS= read -r agent; do + [[ -z "$agent" ]] && continue + echo + exo agent show "$agent" | awk ' + /^slug:/ { slug=$2 } + /^name:/ { name=substr($0, 7) } + END { + if (slug != "") { + printf "%s", slug + if (name != "") { + printf " - %s", name + } + printf "\n" + } + } + ' + exo conversation list "$agent" | awk -F '\t' 'NR == 1 { next } { printf " %s - %s\n", $1, $2 }' + done <<<"$agents" +} + +stop_scheduler() { + local pid_file pid + pid_file="$(scheduler_pid_file)" + [[ -f "$pid_file" ]] || return + pid="$(<"$pid_file")" + if [[ "$pid" =~ ^[0-9]+$ ]] && kill -0 
"$pid" >/dev/null 2>&1; then + echo "Stopping Exoclaw scheduler..." + kill "$pid" >/dev/null 2>&1 || true + fi + rm -f "$pid_file" +} + +delete_all_agents_and_conversations() { + ensure_exo_bin + stop_scheduler + + local agents + agents="$(exo agent list | awk 'NR > 1 { print $1 }')" + if [[ -z "$agents" ]]; then + echo "No agents to delete." + return + fi + + while IFS= read -r agent; do + [[ -z "$agent" ]] && continue + + local conversations + conversations="$(exo conversation list "$agent" | awk -F '\t' 'NR > 1 { print $1 }')" + while IFS= read -r conversation; do + [[ -z "$conversation" ]] && continue + echo "Deleting conversation $agent/$conversation..." + exo conversation delete "$agent" "$conversation" >/dev/null + done <<<"$conversations" + + echo "Deleting agent $agent..." + exo agent delete "$agent" >/dev/null + done <<<"$agents" + + echo "Deleted all agents and conversations." +} + +run_repl() { + ensure_exo_bin + if [[ "$USE_SANDBOX" == true ]]; then + if [[ "$PULL_SANDBOX" == true ]]; then + container_pull_image + else + preflight_sandbox_image + fi + fi + ensure_agent + ensure_conversation + ensure_scheduler + exec "$EXO_BIN" --env-file-if-exists "$ENV_FILE" chat repl "$AGENT" "$CONVERSATION" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + list) + shift + [[ $# -eq 0 ]] || die "list does not accept additional arguments" + COMMAND="list" + ;; + delall|delete-all) + shift + [[ "${1:-}" == "all" ]] || die "delall requires literal argument: all" + shift + [[ $# -eq 0 ]] || die "delall all does not accept additional arguments" + COMMAND="delall" + ;; + setup-sandbox) + shift + COMMAND="setup-sandbox" + ;; + --model) + MODEL="${2:-}" + [[ -n "$MODEL" ]] || die "--model requires a value" + shift 2 + ;; + --agent) + AGENT="${2:-}" + [[ -n "$AGENT" ]] || die "--agent requires a value" + shift 2 + ;; + --conversation|--convo) + CONVERSATION="${2:-}" + [[ -n "$CONVERSATION" ]] || die "$1 requires a value" + shift 2 + ;; + --agent-name) + AGENT_NAME="${2:-}" + [[ 
-n "$AGENT_NAME" ]] || die "--agent-name requires a value" + shift 2 + ;; + --conversation-name) + CONVERSATION_NAME="${2:-}" + [[ -n "$CONVERSATION_NAME" ]] || die "--conversation-name requires a value" + shift 2 + ;; + --module) + MODULE="${2:-}" + [[ -n "$MODULE" ]] || die "--module requires a value" + shift 2 + ;; + --sandbox-image) + SANDBOX_IMAGE="${2:-}" + [[ -n "$SANDBOX_IMAGE" ]] || die "--sandbox-image requires a value" + shift 2 + ;; + --networking) + NETWORKING="${2:-}" + [[ "$NETWORKING" == "enabled" || "$NETWORKING" == "disabled" ]] || die "--networking must be enabled or disabled" + shift 2 + ;; + --shell-program) + SHELL_PROGRAM="${2:-}" + [[ -n "$SHELL_PROGRAM" ]] || die "--shell-program requires a value" + shift 2 + ;; + --scheduler-interval) + SCHEDULER_INTERVAL_SECONDS="${2:-}" + [[ "$SCHEDULER_INTERVAL_SECONDS" =~ ^[0-9]+$ && "$SCHEDULER_INTERVAL_SECONDS" -gt 0 ]] || die "--scheduler-interval requires a positive integer" + shift 2 + ;; + --no-scheduler) + START_SCHEDULER=false + shift + ;; + --scheduler) + START_SCHEDULER=true + shift + ;; + --pull-sandbox) + PULL_SANDBOX=true + shift + ;; + --no-sandbox) + USE_SANDBOX=false + shift + ;; + --env-file) + ENV_FILE="${2:-}" + [[ -n "$ENV_FILE" ]] || die "--env-file requires a value" + shift 2 + ;; + --exo-bin) + EXO_BIN="${2:-}" + [[ -n "$EXO_BIN" ]] || die "--exo-bin requires a value" + shift 2 + ;; + --help|-h) + usage + exit 0 + ;; + *) + die "unknown argument: $1" + ;; + esac +done + +case "$COMMAND" in + repl) + run_repl + ;; + list) + list_agents_and_conversations + ;; + delall) + delete_all_agents_and_conversations + ;; + setup-sandbox) + setup_sandbox + ;; +esac diff --git a/tools.md b/tools.md deleted file mode 100644 index 8d6fa02..0000000 --- a/tools.md +++ /dev/null @@ -1,1079 +0,0 @@ -# Tool Support Plan - -## Context - -`exo` separates the trusted exoharness substrate from executor-owned agent -semantics. 
Tool support should follow the same split: - -- The exoharness owns durable state, events, artifacts, bindings, secrets, and - sandbox execution. -- Executors and harness modules own which model-facing tools exist, how they - are exposed, how calls are authorized, and how calls are dispatched. - -The tool system should make it easy for harnesses to expose a small set of -model-callable functions without turning exoharness into a product-specific -integration registry. The exoharness can already store bindings and secrets. -Tools should use those generic substrate capabilities when they need -configuration or credentials, but tool semantics remain above the substrate. - -## Goals - -- Let TypeScript harnesses compose tools without hard-coding every tool in Rust. -- Keep the model-facing tool contract portable across model runtimes. -- Preserve `tool_requested` and `tool_result` events as the canonical durable - record of tool use. -- Use existing bindings and secrets for credentials instead of tool-specific - secret plumbing. -- Keep product-specific tool behavior out of the exoharness substrate. -- Support three tool sources: built-in, library, and agent. -- Make it possible for an agent to create a local tool as code without needing a - package distribution system. - -## Non-Goals - -- Do not make exoharness a registry of specific app semantics. -- Do not require all tool execution to cross the Rust `execute_tool` protocol. -- Do not choose one model provider's tool schema as the internal source of - truth. -- Do not design a standardized tool marketplace or package distribution system - yet. -- Do not add product-specific event variants, binding kinds, or storage records - for individual tools. 
- -## Current Shape - -The current TypeScript surface has a small `ToolDefinition`: - -```ts -interface ToolDefinition { - name: string; - description: string; - parameters: JsonValue; -} -``` - -`examples/typescript/basic-harness.ts` exposes only -`buildShellToolDefinitions(context.conversationConfig)`. When the model calls a -tool, the TypeScript runner sends an `execute_tool` runtime request to Rust. -Rust's `BasicToolRuntime` currently dispatches only `shell`, backed by the -conversation sandbox. - -That is a good host-backed boundary for built-in tools that need Rust-owned -runtime behavior. It should not be the only tool execution path. TypeScript can -already access events, artifacts, bindings, secrets, and sandbox processes -through `TurnContext`, so many tools can execute directly in TypeScript while -still using exoharness for durable and privileged operations. - -## Tool Sources - -### Built-In - -Built-in tools are maintained by the maintainers of `exo`. They are part of the -core release, reviewed with the project, documented with the harness, and -updated as `exo` evolves. - -Examples: - -- `shell` -- `run_workspace_command`, if we choose to ship it as a first-party tool -- future core exoharness inspection or artifact tools - -Built-ins can still be optional. A conversation or harness should explicitly -choose which built-ins are exposed to the model. - -### Library - -Library tools are not written by the agent itself, but they are also not part of -the core `exo` release. They may be written by the user, by a team, or by a -third party. - -There is no standardized distribution plan yet. For now, a library tool can be a -local TypeScript module imported by a harness. Later, library tools could be -distributed as npm packages, copied modules, git submodules, or another format. -The architecture should not depend on that choice. - -Examples: - -- A user-written IRC tool module. -- A team-maintained internal incident-management tool. 
-- An externally maintained GitHub or Linear tool package. - -### Agent - -Agent tools are created by the agent itself. The agent may write a TypeScript -module, a script, or another local artifact, then ask the harness to expose it as -a model-facing tool. - -Agent tools are the riskiest category because the author is the agent. They -should be clearly marked as `agent`, scoped narrowly, and subject to stricter -policy. For a first implementation, agent-created tools should be local to a -conversation or workspace and should not be promoted into shared library tools -without user review. - -## Core Design - -### Model Tool Definition - -Keep the model-facing definition small and provider-neutral: - -```ts -interface ToolDefinition { - name: string; - description: string; - parameters: JsonValue; - outputSchema?: JsonValue; -} -``` - -`outputSchema` should be optional. It is useful for tools that return structured -results, but model runtimes can ignore it when the provider has no native -output-schema concept. - -Auth requirements, source, policy, runtime choice, and provenance should not be -added to `ToolDefinition`. Those are executor concerns. - -### Harness Tool - -Add an executor-side representation around the model definition: - -```ts -type HarnessToolSource = "built_in" | "library" | "agent"; - -interface ToolExecutionContext { - readonly context: TurnContext; - readonly toolCallId?: string; -} - -interface ToolHandler { - execute( - args: JsonObject, - execution: ToolExecutionContext, - ): Promise<ToolResult>; -} - -interface ToolInstance { - definition: ToolDefinition; - source: HarnessToolSource; - handler: ToolHandler; -} -``` - -This is a TypeScript harness API, not an exoharness API. The executor can attach -policy, tracing, auth, and implementation details without changing the portable -model-facing contract. - -### Tool Module - -Library and agent tools should use a standardized module shape. Each tool module -should default export a `Tool`.
The export name is standardized, so loaders do -not need to guess whether a file exports `createTool`, `ircSendMessageTool`, or -something else. This is the TypeScript equivalent of loading a `.so` file with a -known interface. - -Tool modules should separate initialization parameters from runtime parameters: - -- Initialization parameters configure the tool before it is exposed. They are - not model-visible and can include server names, default channels, secret ids, - allowlists, and other setup values. -- Runtime parameters are the model-facing arguments in `definition.parameters`. - They are supplied by the model each time it calls the tool. - -```ts -interface ToolInitializationContext { - readonly context: TurnContext; - readonly source: HarnessToolSource; -} - -interface Tool { - definition: ToolDefinition; - initializationParameters: JsonValue; - initialize( - args: JsonObject, - initialization: ToolInitializationContext, - ): Promise<ToolHandler> | ToolHandler; -} -``` - -The registry or loader combines the module, source, and initialized handler into -a `ToolInstance`: - -```ts -async function initializeTool( - tool: Tool, - source: HarnessToolSource, - initializationArgs: JsonObject, - context: TurnContext, -): Promise<ToolInstance> { - return { - definition: tool.definition, - source, - handler: await tool.initialize(initializationArgs, { - context, - source, - }), - }; -} -``` - -This makes the module contract stable while leaving each tool's private -implementation types, such as `IrcConfig`, internal to that module. The harness -can load the module with `await import(path)` and read `module.default`.
For -static imports, the equivalent is: - -```ts -import * as module from "./foo"; - -const tool = module.default; -``` - -### Tool Registry - -Add a `HarnessToolRegistry` in `typescript/harness/index.ts`: - -```ts -const tools = createToolRegistry(context); - -tools.useBuiltIns(["shell"]); -tools.register(await loadLibraryTool(context, "irc", ircInitialization)); -tools.register(await loadAgentTool(context, "irc_send_message")); - -const request = { - model, - messages, - tools: tools.definitions(), -}; - -const events = await tools.executePending(toolCalls); -``` - -The registry should: - -- Map tool names to `ToolInstance` handlers. -- Reject duplicate tool names at registration time. -- Expose `definitions()` for model calls. -- Execute pending tool calls with streaming `tool_call` and `tool_result` - updates when enabled. -- Return durable `tool_result` events for the caller to append to the turn. -- Preserve each tool's source so policy and tracing can distinguish built-in, - library, and agent tools. -- Support registering initialized `Tool` default exports for - library and agent tools. - -The existing `context.executePendingTools` can remain as the host-backed default -for compatibility with simple harnesses. The registry should be the preferred -path for TypeScript harnesses that compose multiple tool sources. - -## Execution Paths - -### Host-Backed Execution - -Some tools should continue to delegate to Rust or another host runtime. `shell` -is the main example today. The TypeScript registry can expose `shell` as a -built-in tool while its handler delegates to: - -```ts -context.executeTool({ - functionName: "shell", - arguments: args, -}); -``` - -This lets Rust continue to own sandbox lifecycle and shell execution while the -TypeScript harness gets a uniform registry API. - -### TypeScript Execution - -Library and agent tools can often run directly in the TypeScript harness runner. 
-They can call external APIs, use Node libraries, access generic exoharness -bindings and secrets, write artifacts, and append custom events through the -existing `TurnContext`. - -This path is useful for tools where the trusted substrate does not need to know -the protocol semantics. - -### Sandboxed Process Execution - -Some tools need to run code in a sandbox. A built-in tool such as -`run_workspace_command` can use: - -```ts -const process = await context.startSandboxProcess({ - command: [shellProgram, "-lc", command], -}); -``` - -This is useful for running scripts or local programs, but it should be treated -as a powerful built-in capability, not as its own tool source. If we expose it, -it should have explicit policy and should be enabled intentionally. - -Before relying on sandboxed execution for untrusted agent-authored code, we -should verify the sandbox security model. If we need strong in-process -JavaScript isolation, a smaller runtime such as QuickJS may be a better fit than -unrestricted Node execution. - -## Configuration and Credentials - -Tools should use existing generic substrate objects: - -- Non-secret configuration can live in harness code, agent config, - conversation config, artifacts, or future generic installation records. -- Credentials should live in `Secret`. -- References between configuration and credentials should use secret ids or - existing binding ids. -- Tool definitions should not expose raw credential material. -- Tool result events should not contain raw credential material. 
- -If persisted tool installation state becomes necessary, add a generic record -that does not encode product-specific semantics: - -```ts -interface ToolInstallation { - id: string; - toolId: string; - source: "library" | "agent"; - version?: string; - scope: "exoharness" | "agent" | "conversation"; - initialization: JsonObject; - bindingIds: string[]; - secretIds: string[]; - enabled?: boolean; -} -``` - -Do this only when the executor needs persisted tool configuration. The first -implementation can work with explicit imports and local initialization arguments -in the harness. - -## Policy - -Policy belongs to the executor or harness module. Evaluate it in two places: - -- Exposure time: decide whether a tool should be included in - `tools.definitions()` for the current turn. -- Invocation time: decide whether the exact call can run with the supplied - arguments, credentials, bindings, mounts, network access, and user/session - context. - -The first implementation can keep policy simple and explicit: - -- `shell` is exposed only when `conversationConfig.shellProgram` is set. -- Networked tools require explicit networking enablement. -- Tools with external side effects should have a confirmation hook before - execution. -- Agent tools should default to the narrowest useful scope. -- Agent tools should not silently persist beyond the conversation or workspace - where they were created. - -The CLI/TUI should render confirmation prompts, but the executor should own the -decision and the durable record of the decision. - -## Events and Observability - -Keep `tool_requested` and `tool_result` as the canonical history. Model runtime -helpers already append `tool_requested` from model outputs, and registry -execution should return `tool_result` events. - -Add optional custom events only when they provide real value: - -- `tool_policy_decision`: exposure or invocation allowed/denied. -- `tool_invocation_started`: tool name, source, optional library id/version. 
-- `tool_invocation_completed`: duration, status, redacted result summary. -- `tool_auth_refreshed`: secret id or binding id, without credential material. - -Large logs should be artifacts. Events should contain summaries and references, -not unbounded output or secrets. - -Tracing should also preserve the tool source. That makes it possible to compare -built-in, library, and agent tool behavior in Braintrust or other tracing -systems without changing the durable event contract. - -## Incremental Implementation Plan - -The implementation should move in small, testable steps. The first milestone is -shell parity: the TypeScript basic harness should behave exactly as it does -today, but through the registry. Only after that should we add library and agent -tool loading. - -### Step 1: Add Portable Types Only - -Add the core TypeScript types in `typescript/harness/index.ts`: - -- `outputSchema?: JsonValue` on `ToolDefinition`. -- `HarnessToolSource = "built_in" | "library" | "agent"`. -- `ToolExecutionContext`. -- `ToolHandler`. -- `ToolInstance`. -- `ToolInitializationContext`. -- `Tool`. - -For Rust, add `output_schema: Option` to the Rust `ToolDefinition` only -if the Rust model-runtime path needs to deserialize or forward tool definitions -with output schemas. This can be done later if TypeScript-only work does not -touch Rust serialization. - -Test checkpoint: - -- `pnpm typecheck` -- `cargo test -p executor` if the Rust `ToolDefinition` changes - -Expected behavior change: none. - -### Step 2: Add Registry Without Switching Harnesses - -Add `HarnessToolRegistry` and `createToolRegistry(context)`. - -The registry should support: - -- `register(tool: ToolInstance)`. -- Duplicate-name rejection. -- `definitions()`. -- `get(name)`. -- `executePending(toolCalls)`, including stream events and `tool_result` event - construction. - -At this point, no harness needs to use it yet. - -Test checkpoint: - -- Unit tests for duplicate registration. 
-- Unit tests for `definitions()`. -- Unit tests for `executePending(...)` using a fake in-memory `ToolInstance`. -- `pnpm typecheck` - -Expected behavior change: none. - -### Step 3: Move Shell Definition Behind A Built-In Tool - -Implement a built-in shell `ToolInstance` that delegates execution to the -existing host path: - -```ts -context.executeTool({ - functionName: "shell", - arguments: args, -}); -``` - -Then reimplement `buildShellToolDefinitions(config)` through the built-in shell -helper. Existing callers should still receive the same model-facing shell -definition. - -Test checkpoint: - -- Existing tests still pass. -- A focused test verifies `buildShellToolDefinitions(...)` returns the same - shape as before. -- A focused test verifies the shell `ToolInstance` delegates to - `context.executeTool`. - -Expected behavior change: none. - -### Step 4: Let Tracing Use A Custom Tool Executor - -Update `ResponsesRuntime.traceToolCall(...)` to accept an optional execution -callback: - -```ts -execute = (toolCall: PendingToolCall) => - context.executePendingTools([toolCall]); -``` - -The default preserves existing behavior. Registry-aware harnesses can pass: - -```ts -(toolCall) => tools.executePending([toolCall]); -``` - -Test checkpoint: - -- Unit test or typecheck proving existing call sites compile unchanged. -- Unit test proving a supplied callback is used. -- `pnpm typecheck` - -Expected behavior change: none for existing harnesses. - -### Step 5: Switch The Basic TypeScript Harness To Shell Through Registry - -Update `examples/typescript/basic-harness.ts` to: - -- Create a registry once per turn loop. -- Register built-in `shell`. -- Pass `tools.definitions()` to the model. -- Execute tool calls through the registry callback passed to - `traceToolCall(...)`. - -This step should expose only shell, so it should be behaviorally equivalent to -the current basic TypeScript harness. 
- -Test checkpoint: - -- `pnpm typecheck` -- Existing TypeScript harness tests or e2e script. -- Manual smoke test: ask the basic TypeScript harness to run a simple shell - command and verify `tool_requested` / `tool_result` events still appear. - -Expected behavior change: none except internal dispatch path. - -### Step 6: Prove Direct TypeScript Library Tools - -Add one small library tool that does not require Rust. Prefer a harmless local -tool over a networked integration for the first proof, for example: - -- `echo_json` -- `now_fixed_for_test` -- `uppercase` - -The point is to prove that a `Tool` default export can be initialized and -registered, and that its handler can produce a `tool_result` without -`context.executeTool`. - -Test checkpoint: - -- Unit test imports the module, initializes it, registers it, and executes it. -- `pnpm typecheck` - -Expected behavior change: none unless the example harness opts into this tool. - -### Step 7: Add A Local Agent Tool Loading Convention - -Add the smallest local convention for agent tools, such as an artifact or config -record containing: - -```json -{ - "tools": [ - { - "modulePath": ".exo/agent-tools/irc.ts", - "initialization": {} - } - ] -} -``` - -The loader should: - -- Import `module.default`. -- Validate it satisfies the `Tool` shape. -- Validate `initialization` against `initializationParameters`. -- Call `initializeTool(...)`. -- Register the resulting `ToolInstance` with source `"agent"`. - -This can start as a helper used by the example TypeScript harness rather than a -new exoharness storage feature. - -Test checkpoint: - -- Unit test with a generated local agent tool module. -- Unit test for a missing default export. -- Unit test for invalid initialization parameters. -- `pnpm typecheck` - -Expected behavior change: only conversations/harnesses that opt into agent tool -loading can expose agent tools. 
- -### Step 8: Add An Example IRC Tool - -After the local agent tool loading path works, add a concrete IRC tool under an -examples directory, for example `examples/typescript/tools/irc.ts`. - -This should be an example of the standardized `Tool` default export: - -- `definition` exposes the runtime model-facing `irc_send_message` parameters. -- `initializationParameters` exposes setup values such as server, port, nick, - TLS, allowed channels, and optional password secret id. -- `initialize(...)` validates initialization arguments and returns a - `ToolHandler`. -- The handler uses the generic secret APIs for credentials and regular - TypeScript/Node networking for IRC. - -This should be committed separately from the core registry and loading changes. -That keeps the review split clean: first prove the tool API, then add a real -example tool that exercises it. - -Test checkpoint: - -- Unit test imports the IRC tool, validates initialization, initializes it, and - verifies the model-facing definition. -- A network-free handler test should mock the IRC socket or connection layer. -- `pnpm typecheck` - -Expected behavior change: none unless an example harness opts into the IRC tool. - -### Step 9: Add Optional Built-In Code Execution - -Only after shell parity and direct TypeScript tools work, decide whether to add -`run_workspace_command` as a built-in. If added, treat it as a powerful built-in -capability with explicit enablement and tests. - -Test checkpoint: - -- Unit tests for argument validation and structured output. -- Sandbox smoke test. -- Manual review of sandbox security assumptions. - -Expected behavior change: only when the built-in is explicitly enabled. - -### Step 10: Defer Persistent Installation Storage - -Do not add `ToolInstallation` storage until a real library or agent tool needs -durable configuration that cannot reasonably live in harness code, agent config, -conversation config, or artifacts. - -Test checkpoint: - -- None yet. 
This should remain a later design decision. - -Expected behavior change: none. - -## Suggested First Patch - -The first patch should stop at Step 2: - -- Add the TypeScript types. -- Add `HarnessToolRegistry`. -- Add tests for registration, duplicate names, definitions, and execution using - fake in-memory tools. -- Do not change `examples/typescript/basic-harness.ts` yet. -- Do not change Rust unless TypeScript changes force a Rust schema update. - -That patch validates the core API without changing runtime behavior. The second -patch can add shell as a built-in registry tool while preserving the old -`buildShellToolDefinitions(...)` behavior. The third patch can switch the basic -TypeScript harness to registry-backed shell execution. - -## Open Questions - -- What is the confirmation API between executor and CLI/TUI? -- Should `run_workspace_command` be a built-in tool, or should the first built-in - code execution tool have a narrower interface? -- What local entrypoint should agent-created tools use so the agent does not - have to modify the main harness module directly? -- Should library tools be loaded only by explicit imports at first, or should - there be a small manifest format? -- What is the smallest generic installation record needed before adding storage? - -## Recommendation - -Build the TypeScript registry first. Treat tools as built-in, library, or agent -tools. Keep `shell` on the existing Rust execution path, execute library and -agent tools directly in TypeScript where practical, and keep exoharness focused -on durable substrate responsibilities: events, bindings, secrets, artifacts, and -sandbox execution. - -## Example: Agent-Created IRC Tool - -This example walks through how an agent could create IRC support as its own -tool. IRC is useful because it needs network access, configuration, and optional -credentials, but it does not require exoharness to learn anything IRC-specific. 
- -Assume the agent wants to expose this model-facing tool: - -```ts -{ - name: "irc_send_message", - description: "Send a message to an IRC channel.", - parameters: { - type: "object", - additionalProperties: false, - properties: { - channel: { - type: "string", - description: "IRC channel name, for example #exo.", - }, - text: { - type: "string", - description: "Message text to send.", - }, - }, - required: ["channel", "text"], - }, - outputSchema: { - type: "object", - additionalProperties: false, - properties: { - ok: { type: "boolean" }, - server: { type: "string" }, - channel: { type: "string" }, - }, - required: ["ok", "server", "channel"], - }, -} -``` - -### What the Agent Creates - -The agent writes a local tool module, for example -`.exo/agent-tools/irc.ts`: - -```ts -import net from "node:net"; -import tls from "node:tls"; - -import type { Tool, JsonObject, ToolResult, TurnContext } from "@exo/harness"; - -interface IrcConfig { - server: string; - port: number; - nick: string; - username: string; - realname: string; - tls: boolean; - passwordSecretId?: string | null; -} - -const ircTool = { - definition: { - name: "irc_send_message", - description: "Send a message to an IRC channel.", - parameters: { - type: "object", - additionalProperties: false, - properties: { - channel: { type: "string" }, - text: { type: "string" }, - }, - required: ["channel", "text"], - }, - outputSchema: { - type: "object", - additionalProperties: false, - properties: { - ok: { type: "boolean" }, - server: { type: "string" }, - channel: { type: "string" }, - }, - required: ["ok", "server", "channel"], - }, - }, - initializationParameters: { - type: "object", - additionalProperties: false, - properties: { - server: { type: "string" }, - port: { type: "number" }, - nick: { type: "string" }, - username: { type: "string" }, - realname: { type: "string" }, - tls: { type: "boolean" }, - passwordSecretId: { type: ["string", "null"] }, - }, - required: ["server", "port", "nick", 
"username", "realname", "tls"], - }, - initialize(args) { - const config = parseIrcConfig(args); - return { - async execute(args, execution): Promise { - return sendIrcMessage(execution.context, config, args); - }, - }; - }, -} satisfies Tool; - -export default ircTool; - -function parseIrcConfig(args: JsonObject): IrcConfig { - return { - server: stringArgument(args, "server"), - port: numberArgument(args, "port"), - nick: stringArgument(args, "nick"), - username: stringArgument(args, "username"), - realname: stringArgument(args, "realname"), - tls: booleanArgument(args, "tls"), - passwordSecretId: optionalStringArgument(args, "passwordSecretId"), - }; -} - -async function sendIrcMessage( - context: TurnContext, - config: IrcConfig, - args: JsonObject, -): Promise { - const channel = stringArgument(args, "channel"); - const text = stringArgument(args, "text"); - const password = await resolvePassword(context, config.passwordSecretId); - - await withIrcConnection(config, async (socket) => { - if (password) { - socket.write(`PASS ${password}\r\n`); - } - socket.write(`NICK ${config.nick}\r\n`); - socket.write(`USER ${config.username} 0 * :${config.realname}\r\n`); - socket.write(`PRIVMSG ${channel} :${text}\r\n`); - socket.write("QUIT\r\n"); - }); - - return { - ok: true, - server: config.server, - channel, - }; -} - -async function resolvePassword( - context: TurnContext, - secretId: string | null | undefined, -): Promise { - if (!secretId) { - return null; - } - const secret = - await context.exoharness.current.conversation.getSecret(secretId); - if (!secret) { - throw new Error(`IRC password secret does not exist: ${secretId}`); - } - if (secret.type !== "key") { - throw new Error("IRC password secret must be a key secret"); - } - return secret.value; -} - -async function withIrcConnection( - config: IrcConfig, - run: (socket: net.Socket) => Promise | void, -): Promise { - await new Promise((resolve, reject) => { - const socket = config.tls - ? 
tls.connect(config.port, config.server) - : net.connect(config.port, config.server); - socket.setEncoding("utf8"); - socket.setTimeout(10_000); - socket.once("connect", async () => { - try { - await run(socket); - socket.end(resolve); - } catch (error) { - socket.destroy(); - reject(error); - } - }); - socket.once("error", reject); - socket.once("timeout", () => { - socket.destroy(new Error("IRC connection timed out")); - }); - }); -} - -function stringArgument(args: JsonObject, name: string): string { - const value = args[name]; - if (typeof value !== "string" || value.length === 0) { - throw new Error(`IRC tool argument ${name} must be a non-empty string`); - } - return value; -} - -function optionalStringArgument(args: JsonObject, name: string): string | null { - const value = args[name]; - if (value === undefined || value === null) { - return null; - } - if (typeof value !== "string" || value.length === 0) { - throw new Error(`IRC tool initialization ${name} must be a string`); - } - return value; -} - -function numberArgument(args: JsonObject, name: string): number { - const value = args[name]; - if (typeof value !== "number") { - throw new Error(`IRC tool initialization ${name} must be a number`); - } - return value; -} - -function booleanArgument(args: JsonObject, name: string): boolean { - const value = args[name]; - if (typeof value !== "boolean") { - throw new Error(`IRC tool initialization ${name} must be a boolean`); - } - return value; -} -``` - -This module is ordinary TypeScript harness code. It does not require a Rust tool -implementation because it can run directly inside the TypeScript harness runner. -It uses exoharness only for secret lookup. Its default export is the stable -loader contract. `IrcConfig` can remain internal because the harness passes -untyped JSON initialization parameters and the module validates them. 
- -### What the Harness Needs - -The harness needs a local entrypoint for agent-created tools so the agent does -not have to edit the main turn loop every time. A simple first version could be -an explicit loader in `examples/typescript/basic-harness.ts`: - -```ts -interface AgentToolManifest { - tools: Array<{ - modulePath: string; - initialization: JsonObject; - }>; -} - -async function registerAgentTools( - context: TurnContext, - tools: HarnessToolRegistry, -): Promise<void> { - const manifest = - await context.exoharness.current.conversation.readArtifactJson<AgentToolManifest>( - { - artifactId: "agent-tools", - }, - ); - for (const entry of manifest?.tools ?? []) { - const module = (await import(entry.modulePath)) as { - default: Tool; - }; - tools.register( - await initializeTool( - module.default, - "agent", - entry.initialization, - context, - ), - ); - } -} -``` - -`initializeTool` should validate `entry.initialization` against the tool's -`initializationParameters` before calling `initialize(...)`. - -The turn loop then builds the registry and loads agent tools before calling the -model: - -```ts -const tools = createToolRegistry(context).useBuiltIns(["shell"]); -await registerAgentTools(context, tools); - -const request: NativeResponsesRequest = { - model, - messages, - tools: tools.definitions(), - maxOutputTokens: context.agentConfig.maxOutputTokens, - metadata: turnMetadata(context), -}; -``` - -The agent would also need to write the manifest artifact: - -```json -{ - "tools": [ - { - "modulePath": ".exo/agent-tools/irc.ts", - "initialization": { - "server": "irc.libera.chat", - "port": 6697, - "nick": "exo-agent", - "username": "exo", - "realname": "Exo Agent", - "tls": true, - "passwordSecretId": "irc-password" - } - } - ] -} -``` - -This is intentionally a minimal local convention, not a distribution system. A -more polished version could validate the manifest, restrict allowed paths, cache -loaded modules, and require user approval before exposing new agent tools.
- -### Tool Execution Wiring - -The harness passes `tools.definitions()` to the model request and executes -returned calls through `tools.executePending(...)`: - -```ts -const toolResultEvents = await tools.executePending([toolCall]); -await turn.addEvents(toolResultEvents); -``` - -If the current runtime helper still hardcodes -`context.executePendingTools(...)`, update `ResponsesRuntime.traceToolCall` to -accept an optional executor callback: - -```ts -async traceToolCall( - turnParent: TraceParent, - context: TurnContext, - toolCall: PendingToolCall, - roundIndex: number, - execute = (toolCall: PendingToolCall) => - context.executePendingTools([toolCall]), -): Promise { - return tracedUnderParent( - turnParent, - async (span) => { - const events = await execute(toolCall); - span.log({ output: toolResultTraceOutput(events) }); - return events; - }, - // existing trace args - ); -} -``` - -The harness can then call: - -```ts -await runtime.traceToolCall(turnParent, context, toolCall, round, (toolCall) => - tools.executePending([toolCall]), -); -``` - -### Required Configuration - -The conversation or agent must have networking enabled because IRC is an -external network call: - -```bash -exo agent create --model gpt-5.4 --enable-networking "IRC Agent" -``` - -If the IRC server requires a password or NickServ token, store it as a normal -secret: - -```bash -exo secret set irc-password --env IRC_PASSWORD -``` - -The exact CLI command may differ as the config surface evolves, but the storage -model should remain generic: the IRC tool references a secret id, and the -exoharness stores only the secret material. The model sees the tool schema and -arguments, not the password. - -### What Does Not Change - -This example should not require: - -- A new exoharness binding type named `irc`. -- A Rust `ToolRuntime` implementation for IRC. -- IRC-specific event variants. -- Raw IRC credentials in model-visible prompts, tool definitions, or events. 
- -The durable event history remains the same: - -1. The model emits `irc_send_message`. -2. The executor appends `tool_requested`. -3. The registry authorizes and executes the TypeScript handler. -4. The registry returns a `tool_result` event. -5. The next model round sees the result through normal event materialization. - -### Hardening Before Sharing - -For a local experiment, the direct module above is enough. Before treating IRC -as a reusable library tool or allowing broad agent-created tools, add: - -- A policy check that only allows configured servers and channels. -- A confirmation requirement for sending messages to public channels. -- Rate limits and message length validation. -- Redacted observability events for connection failures. -- Manifest validation and path restrictions for agent tool modules. -- Tests for argument validation, duplicate tool registration, missing secrets, - disabled networking, and rejected manifests. - -After review, a user could promote the IRC implementation from an `agent` tool -to a `library` tool by moving it into a user-maintained module and importing it -explicitly from the harness. The exoharness substrate still only needs generic -bindings, secrets, artifacts, events, and sandbox/network policy. 
diff --git a/typescript/harness/index.ts b/typescript/harness/index.ts index 22340c5..7b6aa3f 100644 --- a/typescript/harness/index.ts +++ b/typescript/harness/index.ts @@ -7,6 +7,7 @@ export interface JsonObject { export * from "./tools"; export * from "./built-in-tools"; export * from "./tool-manifest"; +export * from "./scheduler-tools"; export type MessageRole = | "system" @@ -23,7 +24,7 @@ export interface Message { export interface AgentConfig { instructions: Message[]; - harness: "basic" | "rlm" | "typescript"; + harness: "basic" | "rlm" | "typescript" | "exoclaw"; typescript?: { modulePath: string; } | null; diff --git a/typescript/harness/runner.ts b/typescript/harness/runner.ts index ff8b14c..f6b006c 100644 --- a/typescript/harness/runner.ts +++ b/typescript/harness/runner.ts @@ -43,7 +43,7 @@ import { interface RawAgentConfig { instructions: Message[]; - harness: "basic" | "rlm" | "typescript" | "type_script"; + harness: "basic" | "rlm" | "typescript" | "type_script" | "exoclaw"; typescript?: { module_path: string; } | null; diff --git a/typescript/harness/scheduler-tools.ts b/typescript/harness/scheduler-tools.ts new file mode 100644 index 0000000..af3b86e --- /dev/null +++ b/typescript/harness/scheduler-tools.ts @@ -0,0 +1,215 @@ +import type { JsonObject, ToolDefinition, TurnContext } from "./index"; +import type { HarnessToolRegistry, ToolInstance } from "./tools"; + +export type SchedulerToolName = + | "schedule_sandbox_task" + | "list_scheduled_tasks" + | "cancel_scheduled_task" + | "delete_scheduled_task"; + +export function createSchedulerToolInstances(): ToolInstance[] { + return [ + createScheduleSandboxTaskTool(), + createListScheduledTasksTool(), + createCancelScheduledTaskTool(), + createDeleteScheduledTaskTool(), + ]; +} + +export function registerSchedulerTools( + registry: HarnessToolRegistry, + names: SchedulerToolName[] = [ + "schedule_sandbox_task", + "list_scheduled_tasks", + "cancel_scheduled_task", + "delete_scheduled_task", + ], 
+): void { + const requested = new Set(names); + for (const tool of createSchedulerToolInstances()) { + if (requested.has(tool.definition.name as SchedulerToolName)) { + registry.register(tool); + } + } +} + +function createScheduleSandboxTaskTool(): ToolInstance { + return { + source: "built_in", + definition: { + name: "schedule_sandbox_task", + description: + "Schedule a recurring command to run in this conversation's sandbox. A host scheduler owns timing and will wake this conversation with compact results when runs complete. The scheduler reuses the shared conversation sandbox when available; use setupCommand for task-specific setup that should run before each scheduled run.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + name: { + type: "string", + description: + "Stable task name using letters, numbers, dashes, or underscores.", + }, + schedule: { + type: "string", + description: + "Schedule as '@every 10m', '@every 1h', or a simple cron interval like '*/30 * * * *'.", + }, + command: { + type: "array", + items: { type: "string" }, + minItems: 1, + description: + "Command argv to run in the sandbox, for example ['bash', '-lc', 'curl -fsSL https://example.com/health'].", + }, + sandboxMode: { + anyOf: [ + { type: "string", enum: ["conversation", "task_fresh"] }, + { type: "null" }, + ], + description: + "Sandbox selection mode. Use 'conversation' or null to run in the shared persistent conversation sandbox. Use 'task_fresh' to create a separate fresh sandbox for this task and reuse it across that task's runs.", + }, + setupCommand: { + anyOf: [ + { + type: "array", + items: { type: "string" }, + minItems: 1, + }, + { type: "null" }, + ], + description: + "Optional argv to run immediately before each scheduled run in the shared conversation sandbox, for example ['bash', '-lc', 'apt-get update && apt-get install -y curl']. 
Use this for dependencies that should be prepared before each run.", + }, + reportPrompt: { + type: "string", + description: + "Instructions for how to report each completed run back to the user.", + }, + maxOutputBytes: { + type: ["number", "null"], + description: + "Maximum bytes to retain from each output stream before truncating, or null for the default.", + }, + }, + required: [ + "name", + "schedule", + "command", + "sandboxMode", + "setupCommand", + "reportPrompt", + "maxOutputBytes", + ], + }, + }, + handler: { + execute(args, execution) { + return execution.context.executeTool({ + functionName: "schedule_sandbox_task", + arguments: withConversationScope(execution.context, args), + }); + }, + }, + }; +} + +function createListScheduledTasksTool(): ToolInstance { + return { + source: "built_in", + definition: { + name: "list_scheduled_tasks", + description: + "List scheduled sandbox tasks for this conversation. Disabled tasks are hidden unless includeDisabled is true.", + parameters: { + type: "object", + additionalProperties: false, + properties: { + includeDisabled: { + type: ["boolean", "null"], + description: + "Whether to include disabled/cancelled tasks. Use false or null for the default active-task view.", + }, + }, + required: ["includeDisabled"], + }, + }, + handler: { + execute(args, execution) { + return execution.context.executeTool({ + functionName: "list_scheduled_tasks", + arguments: withConversationScope(execution.context, args), + }); + }, + }, + }; +} + +function createDeleteScheduledTaskTool(): ToolInstance { + return { + source: "built_in", + definition: { + name: "delete_scheduled_task", + description: + "Permanently delete a scheduled sandbox task for this conversation, including its stored run history. 
Use cancel_scheduled_task instead when history should be preserved.", + parameters: taskIdParameters( + "Scheduled task id returned by schedule_sandbox_task or list_scheduled_tasks.", + ), + }, + handler: { + execute(args, execution) { + return execution.context.executeTool({ + functionName: "delete_scheduled_task", + arguments: withConversationScope(execution.context, args), + }); + }, + }, + }; +} + +function createCancelScheduledTaskTool(): ToolInstance { + return { + source: "built_in", + definition: { + name: "cancel_scheduled_task", + description: "Disable a scheduled sandbox task for this conversation.", + parameters: taskIdParameters( + "Scheduled task id returned by schedule_sandbox_task or list_scheduled_tasks.", + ), + }, + handler: { + execute(args, execution) { + return execution.context.executeTool({ + functionName: "cancel_scheduled_task", + arguments: withConversationScope(execution.context, args), + }); + }, + }, + }; +} + +function taskIdParameters(description: string): ToolDefinition["parameters"] { + return { + type: "object", + additionalProperties: false, + properties: { + taskId: { + type: "string", + description, + }, + }, + required: ["taskId"], + }; +} + +function withConversationScope( + context: TurnContext, + args: JsonObject, +): JsonObject { + const { agent, conversation } = context.exoharness.current; + return { + ...args, + agentId: agent.record.id, + conversationId: conversation.record.id, + }; +} From d816def390bc8318113388ffa1261efe008e9cc8 Mon Sep 17 00:00:00 2001 From: 61cygni <> Date: Sat, 23 May 2026 17:24:33 -0700 Subject: [PATCH 8/8] Default Exoclaw sandboxes to agent scope Co-authored-by: Cursor --- README.md | 16 +++ crates/cli/src/main.rs | 57 +++++++- crates/executor/src/agent_sandbox.rs | 139 ++++++++++++++++++++ crates/executor/src/basic.rs | 19 ++- crates/executor/src/basic_tests.rs | 5 + crates/executor/src/conversation_sandbox.rs | 26 ++-- crates/executor/src/executor_types.rs | 29 +++- 
crates/executor/src/harness_executor.rs | 15 ++- crates/executor/src/harness_tool.rs | 59 ++++++++- crates/executor/src/lib.rs | 5 +- crates/executor/src/scheduler_runtime.rs | 65 ++++++--- crates/executor/src/scheduler_types.rs | 5 +- crates/executor/src/typescript.rs | 13 +- crates/exoharness/src/sandbox.rs | 49 +------ examples/exoclaw/README.md | 57 +++++--- examples/exoclaw/harness.ts | 2 +- scripts/exoclaw-repl | 29 +++- typescript/harness/index.ts | 1 + typescript/harness/runner.ts | 2 + typescript/harness/scheduler-tools.ts | 8 +- 20 files changed, 476 insertions(+), 125 deletions(-) create mode 100644 crates/executor/src/agent_sandbox.rs diff --git a/README.md b/README.md index fe6ba7e..ef1433e 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,22 @@ corresponding CLI commands are under: ./target/debug/exo --harness exoclaw schedule --help ``` +By default, Exoclaw uses an agent-scoped sandbox for shell commands and scheduled +tasks, so setup done through the REPL is shared by future tasks and conversations +for that agent while the warm sandbox is alive, including across normal REPL +restarts. Conversation-scoped and task-scoped sandboxes are available for +isolation. The sandbox filesystem is not yet durable across warm container +cleanup; use a prepared image or task `setupCommand` for dependencies that must +survive host/container cleanup. + +To make a conversation use its own sandbox for shell commands instead of the +agent sandbox, create or update it with `--sandbox-scope conversation`: + +```bash +./target/debug/exo --harness exoclaw conversation create exoclaw-agent "Isolated Dev" --sandbox-scope conversation +scripts/exoclaw-repl --conversation isolated-dev --sandbox-scope conversation +``` + ## Repository Layout - `crates`: Rust workspace for the CLI, exoharness substrate, and executors. 
diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index cd9e73f..94cf4aa 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -23,9 +23,9 @@ use executor::{ BraintrustRuntimeConfig, BraintrustTracingConfig, ConversationModelConfig, CreateAgentRequest, CreateConversationRequest, EventQuery, EventQueryDirection, ExoHarness, ExoclawToolRuntime, FileSystemMount, FileSystemMountMode, ForkConversationRequest, Harness, HarnessAgent, - HarnessConversation, PutSecretRequest, RlmHarness, SANDBOX_MAIN_MOUNT_DIR, Secret, - ToolManifestEntry, TypeScriptHarness, TypeScriptHarnessConfig, Uuid7, load_agent_config, - send_conversation_wakeup, + HarnessConversation, PutSecretRequest, RlmHarness, SANDBOX_MAIN_MOUNT_DIR, SandboxScope, + Secret, ToolManifestEntry, TypeScriptHarness, TypeScriptHarnessConfig, Uuid7, + effective_sandbox_scope, load_agent_config, send_conversation_wakeup, }; use lingua::Message; use lingua::universal::{AssistantContent, AssistantContentPart, ToolContentPart, UserContent}; @@ -73,6 +73,21 @@ enum NetworkingMode { Disabled, } +#[derive(Debug, Clone, Copy, ValueEnum)] +enum SandboxScopeArg { + Agent, + Conversation, +} + +impl From for SandboxScope { + fn from(value: SandboxScopeArg) -> Self { + match value { + SandboxScopeArg::Agent => SandboxScope::Agent, + SandboxScopeArg::Conversation => SandboxScope::Conversation, + } + } +} + #[derive(Debug, Subcommand)] enum Commands { /// Start a chat REPL using a registered model, creating a default agent and @@ -203,6 +218,8 @@ enum ConversationCommands { name: Option, #[arg(long)] slug: Option, + #[arg(long, value_enum)] + sandbox_scope: Option, #[arg(long)] repl: bool, }, @@ -226,6 +243,8 @@ enum ConversationCommands { clear_shell_program: bool, #[arg(long, value_enum)] networking: Option, + #[arg(long, value_enum)] + sandbox_scope: Option, #[arg(long)] model: Option, #[arg(long)] @@ -724,6 +743,7 @@ async fn main() -> Result<(), Box> { agent, name, slug, + sandbox_scope, repl, } => { let 
agent = must_get_agent(harness.as_ref(), &agent).await?; @@ -742,6 +762,11 @@ async fn main() -> Result<(), Box> { name, }) .await?; + if let Some(sandbox_scope) = sandbox_scope { + let mut config = conversation.config().await?; + config.sandbox_scope = Some(sandbox_scope.into()); + conversation.put_config(config).await?; + } println!( "created conversation {} ({})", conversation.record().slug, @@ -803,6 +828,7 @@ async fn main() -> Result<(), Box> { shell_program, clear_shell_program, networking, + sandbox_scope, model, max_output_tokens, clear_max_output_tokens, @@ -852,6 +878,11 @@ async fn main() -> Result<(), Box> { changed = true; } + if let Some(sandbox_scope) = sandbox_scope { + config.sandbox_scope = Some(sandbox_scope.into()); + changed = true; + } + let updated_model_override = if clear_model_override { changed = true; Some(None) @@ -1015,7 +1046,18 @@ async fn main() -> Result<(), Box> { ); println!( "shell_program: {}", - config.shell_program.unwrap_or_else(|| "none".to_string()) + config.shell_program.as_deref().unwrap_or("none") + ); + println!( + "sandbox_scope: {}", + config + .sandbox_scope + .map(sandbox_scope_name) + .unwrap_or("default") + ); + println!( + "effective_sandbox_scope: {}", + sandbox_scope_name(effective_sandbox_scope(&agent_config, &config)) ); println!( "effective_sandbox_image: {}", @@ -1699,6 +1741,13 @@ fn chat_repl_command(agent_slug: &str, conversation_slug: &str) -> String { format!("exo chat repl {agent_slug} {conversation_slug}") } +fn sandbox_scope_name(scope: SandboxScope) -> &'static str { + match scope { + SandboxScope::Agent => "agent", + SandboxScope::Conversation => "conversation", + } +} + fn slugify(input: &str) -> String { let mut slug = String::new(); let mut last_was_dash = false; diff --git a/crates/executor/src/agent_sandbox.rs b/crates/executor/src/agent_sandbox.rs new file mode 100644 index 0000000..981deeb --- /dev/null +++ b/crates/executor/src/agent_sandbox.rs @@ -0,0 +1,139 @@ +use exoharness::{ + 
AgentHandle, Artifact, ArtifactVersion, ConversationHandle, CreateSandboxRequest, + ReadArtifactRequest, Result, Uuid7, WriteArtifactRequest, +}; +use serde::{Deserialize, Serialize}; + +use crate::conversation_sandbox::{ + ConversationSandboxSpec, conversation_sandbox_spec, conversation_sandboxes, +}; +use crate::{AgentConfig, ConversationConfig}; + +const AGENT_SANDBOX_ARTIFACT_PATH: &str = "config/exoclaw-agent-sandbox.json"; + +#[derive(Clone)] +pub(crate) struct AgentSandboxHandle { + pub(crate) conversation: std::sync::Arc, + pub(crate) sandbox_id: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +struct AgentSandboxRecord { + conversation_id: String, + sandbox_id: String, + image: String, + default_workdir: String, + file_system_mounts: Vec, + enable_networking: bool, + idle_seconds: u64, +} + +impl AgentSandboxRecord { + fn matches_spec(&self, spec: &ConversationSandboxSpec) -> bool { + self.image == spec.image + && self.default_workdir == spec.default_workdir + && self.file_system_mounts == spec.file_system_mounts + && self.enable_networking == spec.enable_networking + && self.idle_seconds == spec.idle_seconds + } +} + +pub(crate) async fn ensure_agent_sandbox( + agent: &dyn AgentHandle, + current_conversation: &dyn ConversationHandle, + agent_config: &AgentConfig, + conversation_config: &ConversationConfig, +) -> Result { + let spec = conversation_sandbox_spec(agent_config, conversation_config); + if let Some(record) = load_agent_sandbox_record(agent).await? + && record.matches_spec(&spec) + && let Ok(conversation_id) = record.conversation_id.parse::() + && let Some(owner) = agent.get_conversation(&conversation_id).await? + { + for sandbox in conversation_sandboxes(owner.as_ref()).await? 
{ + if sandbox.id == record.sandbox_id && sandbox.matches_spec(&spec) { + return Ok(AgentSandboxHandle { + conversation: owner, + sandbox_id: record.sandbox_id, + }); + } + } + } + + let sandbox_id = current_conversation + .create_sandbox(CreateSandboxRequest { + image: spec.image.clone(), + default_workdir: Some(spec.default_workdir.clone()), + file_system_mounts: Some(spec.file_system_mounts.clone()), + enable_networking: Some(spec.enable_networking), + idle_seconds: Some(spec.idle_seconds), + }) + .await?; + store_agent_sandbox_record( + agent, + &AgentSandboxRecord { + conversation_id: current_conversation.record().id.to_string(), + sandbox_id: sandbox_id.clone(), + image: spec.image, + default_workdir: spec.default_workdir, + file_system_mounts: spec.file_system_mounts, + enable_networking: spec.enable_networking, + idle_seconds: spec.idle_seconds, + }, + ) + .await?; + + let Some(owner) = agent + .get_conversation(&current_conversation.record().id) + .await? + else { + anyhow::bail!( + "agent sandbox owner conversation disappeared: {}", + current_conversation.record().id + ); + }; + Ok(AgentSandboxHandle { + conversation: owner, + sandbox_id, + }) +} + +async fn load_agent_sandbox_record(agent: &dyn AgentHandle) -> Result<Option<AgentSandboxRecord>> { + let Some(artifact) = latest_agent_artifact(agent, AGENT_SANDBOX_ARTIFACT_PATH).await? 
else { + return Ok(None); + }; + Ok(Some(serde_json::from_slice(&artifact.contents)?)) +} + +async fn store_agent_sandbox_record( + agent: &dyn AgentHandle, + record: &AgentSandboxRecord, +) -> Result<()> { + agent + .write_artifact(WriteArtifactRequest { + path: AGENT_SANDBOX_ARTIFACT_PATH.to_string(), + contents: serde_json::to_vec_pretty(record)?, + }) + .await?; + Ok(()) +} + +async fn latest_agent_artifact(agent: &dyn AgentHandle, path: &str) -> Result> { + let Some(version) = latest_artifact_version(agent.list_artifacts().await?, path) else { + return Ok(None); + }; + agent + .read_artifact(ReadArtifactRequest { + artifact_id: version.artifact_id, + version: Some(version.version), + }) + .await +} + +fn latest_artifact_version(artifacts: Vec, path: &str) -> Option { + artifacts + .into_iter() + .filter(|artifact| artifact.path == path) + .max_by_key(|artifact| artifact.version) +} diff --git a/crates/executor/src/basic.rs b/crates/executor/src/basic.rs index 798dd03..c6fe8f9 100644 --- a/crates/executor/src/basic.rs +++ b/crates/executor/src/basic.rs @@ -107,6 +107,7 @@ where async fn run_turn_loop( &self, + agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, turn: &dyn TurnHandle, agent_config: &AgentConfig, @@ -142,7 +143,9 @@ where let tool_results = self .execute_tool_round( + agent, conversation, + agent_config, conversation_config, tool_requests, round as usize, @@ -243,7 +246,9 @@ where async fn execute_tool_round( &self, + agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, + agent_config: &AgentConfig, conversation_config: &ConversationConfig, tool_requests: Vec, round: usize, @@ -274,7 +279,13 @@ where }; let result = match self .tools - .execute(conversation, conversation_config, &tool_request.request) + .execute( + agent, + conversation, + agent_config, + conversation_config, + &tool_request.request, + ) .await { Ok(response) => response, @@ -317,12 +328,13 @@ where async fn prepare_conversation( &self, + agent: &dyn 
AgentHandle, conversation: &dyn ConversationHandle, agent_config: &AgentConfig, conversation_config: &ConversationConfig, ) -> Result<()> { self.tools - .prepare_conversation(conversation, agent_config, conversation_config) + .prepare_conversation(agent, conversation, agent_config, conversation_config) .await } @@ -332,7 +344,7 @@ where async fn run_turn( &self, - _agent: &dyn AgentHandle, + agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, turn: Arc, agent_config: &AgentConfig, @@ -342,6 +354,7 @@ where turn_trace: Option<&dyn TurnExecutionTrace>, ) -> Result<()> { self.run_turn_loop( + agent, conversation, turn.as_ref(), agent_config, diff --git a/crates/executor/src/basic_tests.rs b/crates/executor/src/basic_tests.rs index 3d846f8..bcf686c 100644 --- a/crates/executor/src/basic_tests.rs +++ b/crates/executor/src/basic_tests.rs @@ -60,6 +60,7 @@ async fn send_appends_user_and_assistant_messages() { executor .prepare_conversation( + agent.as_ref(), conversation.as_ref(), &default_agent_config(), &ConversationConfig::default(), @@ -152,6 +153,7 @@ async fn send_executes_tool_round_trip() { enable_networking: true, shell_program: Some("bash".to_string()), mounts: Vec::new(), + sandbox_scope: None, }; let turn = conversation .begin_turn(BeginTurnRequest { @@ -259,6 +261,7 @@ async fn send_stream_emits_chunks_and_persists_final_response() { executor .prepare_conversation( + agent.as_ref(), conversation.as_ref(), &default_agent_config(), &ConversationConfig::default(), @@ -439,7 +442,9 @@ impl FakeToolRuntime { impl ToolRuntime for FakeToolRuntime { async fn execute( &self, + _agent: &dyn AgentHandle, _conversation: &dyn ConversationHandle, + _agent_config: &AgentConfig, _config: &ConversationConfig, _request: &ToolRequest, ) -> Result { diff --git a/crates/executor/src/conversation_sandbox.rs b/crates/executor/src/conversation_sandbox.rs index 7347fa1..72e0e88 100644 --- a/crates/executor/src/conversation_sandbox.rs +++ 
b/crates/executor/src/conversation_sandbox.rs @@ -7,15 +7,15 @@ use exoharness::{ #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct ConversationSandboxInfo { pub(crate) id: String, - image: String, - default_workdir: String, - file_system_mounts: Vec, - enable_networking: bool, - idle_seconds: u64, + pub(crate) image: String, + pub(crate) default_workdir: String, + pub(crate) file_system_mounts: Vec, + pub(crate) enable_networking: bool, + pub(crate) idle_seconds: u64, } impl ConversationSandboxInfo { - fn matches_spec(&self, spec: &ConversationSandboxSpec) -> bool { + pub(crate) fn matches_spec(&self, spec: &ConversationSandboxSpec) -> bool { self.image == spec.image && self.default_workdir == spec.default_workdir && self.file_system_mounts == spec.file_system_mounts @@ -25,12 +25,12 @@ impl ConversationSandboxInfo { } #[derive(Debug, Clone, PartialEq, Eq)] -struct ConversationSandboxSpec { - image: String, - default_workdir: String, - file_system_mounts: Vec, - enable_networking: bool, - idle_seconds: u64, +pub(crate) struct ConversationSandboxSpec { + pub(crate) image: String, + pub(crate) default_workdir: String, + pub(crate) file_system_mounts: Vec, + pub(crate) enable_networking: bool, + pub(crate) idle_seconds: u64, } pub(crate) async fn ensure_conversation_sandbox( @@ -106,7 +106,7 @@ pub(crate) async fn conversation_sandboxes( Ok(sandboxes) } -fn conversation_sandbox_spec( +pub(crate) fn conversation_sandbox_spec( agent_config: &AgentConfig, config: &ConversationConfig, ) -> ConversationSandboxSpec { diff --git a/crates/executor/src/executor_types.rs b/crates/executor/src/executor_types.rs index 6c3ae96..1aedfa3 100644 --- a/crates/executor/src/executor_types.rs +++ b/crates/executor/src/executor_types.rs @@ -5,8 +5,8 @@ use std::time::Duration; use async_trait::async_trait; use exoharness::{ - ConversationHandle, EventId, FileSystemMount, ResponseId, Result, SessionId, ToolArguments, - ToolCallId, ToolRequest, ToolResult, TurnId, + AgentHandle, 
ConversationHandle, EventId, FileSystemMount, ResponseId, Result, SessionId, + ToolArguments, ToolCallId, ToolRequest, ToolResult, TurnId, }; use lingua::{Message, UniversalStreamChunk, UniversalUsage}; use serde::{Deserialize, Serialize}; @@ -68,6 +68,15 @@ pub struct ConversationConfig { pub shell_program: Option, #[serde(default)] pub mounts: Vec, + #[serde(default)] + pub sandbox_scope: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SandboxScope { + Agent, + Conversation, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -95,10 +104,23 @@ impl Default for ConversationConfig { enable_networking: false, shell_program: Some("/bin/bash".to_string()), mounts: Vec::new(), + sandbox_scope: None, } } } +pub fn effective_sandbox_scope( + agent_config: &AgentConfig, + conversation_config: &ConversationConfig, +) -> SandboxScope { + conversation_config + .sandbox_scope + .unwrap_or(match agent_config.harness { + AgentHarnessKind::Exoclaw => SandboxScope::Agent, + _ => SandboxScope::Conversation, + }) +} + #[async_trait] pub trait ModelClient: Send + Sync { async fn complete(&self, request: ModelRequest) -> Result; @@ -115,6 +137,7 @@ pub trait ModelResponseStream: Send { pub trait ToolRuntime: Send + Sync { async fn prepare_conversation( &self, + _agent: &dyn AgentHandle, _conversation: &dyn ConversationHandle, _agent_config: &AgentConfig, _config: &ConversationConfig, @@ -124,7 +147,9 @@ pub trait ToolRuntime: Send + Sync { async fn execute( &self, + agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, + agent_config: &AgentConfig, config: &ConversationConfig, request: &ToolRequest, ) -> Result; diff --git a/crates/executor/src/harness_executor.rs b/crates/executor/src/harness_executor.rs index f1df255..2d53be1 100644 --- a/crates/executor/src/harness_executor.rs +++ b/crates/executor/src/harness_executor.rs @@ -34,6 +34,7 @@ pub(crate) trait HarnessExecutor: 
Send + Sync + Clone + 'static { async fn prepare_conversation( &self, + _agent: &dyn AgentHandle, _conversation: &dyn ConversationHandle, _agent_config: &AgentConfig, _conversation_config: &ConversationConfig, @@ -207,7 +208,12 @@ where )?; apply_conversation_model_override(&mut agent_config, model_override); self.executor - .prepare_conversation(conversation.as_ref(), &agent_config, &conversation_config) + .prepare_conversation( + agent.as_ref(), + conversation.as_ref(), + &agent_config, + &conversation_config, + ) .await?; let prepared = self.executor.prepare_request(&request)?; let turn = conversation @@ -260,7 +266,12 @@ where )?; apply_conversation_model_override(&mut agent_config, model_override); self.executor - .prepare_conversation(conversation.as_ref(), &agent_config, &conversation_config) + .prepare_conversation( + agent.as_ref(), + conversation.as_ref(), + &agent_config, + &conversation_config, + ) .await?; let prepared = self.executor.prepare_request(&request)?; let turn = conversation diff --git a/crates/executor/src/harness_tool.rs b/crates/executor/src/harness_tool.rs index 56b4df9..e64518d 100644 --- a/crates/executor/src/harness_tool.rs +++ b/crates/executor/src/harness_tool.rs @@ -1,13 +1,18 @@ use std::path::PathBuf; +use crate::agent_sandbox::ensure_agent_sandbox; use crate::conversation_sandbox::{conversation_sandboxes, ensure_conversation_sandbox}; use crate::scheduler_store::SchedulerStore; use crate::scheduler_types::{ DEFAULT_MAX_OUTPUT_BYTES, NewScheduledTask, ScheduledTaskSandboxMode, }; use crate::{AgentConfig, ConversationConfig, ToolRuntime}; +use crate::{SandboxScope, effective_sandbox_scope}; use async_trait::async_trait; -use exoharness::{ConversationHandle, Result, RunInSandboxRequest, ToolRequest, ToolResult}; +use exoharness::{ + AgentHandle, ConversationHandle, Result, RunInSandboxRequest, SandboxProcess, ToolRequest, + ToolResult, +}; use futures::io::AsyncReadExt; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ 
-32,6 +37,7 @@ impl ExoclawToolRuntime { impl ToolRuntime for BasicToolRuntime { async fn prepare_conversation( &self, + _agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, agent_config: &AgentConfig, config: &ConversationConfig, @@ -44,7 +50,9 @@ impl ToolRuntime for BasicToolRuntime { async fn execute( &self, + _agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, + _agent_config: &AgentConfig, config: &ConversationConfig, request: &ToolRequest, ) -> Result { @@ -61,22 +69,34 @@ impl ToolRuntime for BasicToolRuntime { impl ToolRuntime for ExoclawToolRuntime { async fn prepare_conversation( &self, + agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, agent_config: &AgentConfig, config: &ConversationConfig, ) -> Result<()> { - ensure_conversation_sandbox(conversation, agent_config, config).await?; + match effective_sandbox_scope(agent_config, config) { + SandboxScope::Agent => { + ensure_agent_sandbox(agent, conversation, agent_config, config).await?; + } + SandboxScope::Conversation => { + ensure_conversation_sandbox(conversation, agent_config, config).await?; + } + } Ok(()) } async fn execute( &self, + agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, + agent_config: &AgentConfig, config: &ConversationConfig, request: &ToolRequest, ) -> Result { match request.function_name.as_str() { - "shell" => execute_shell_tool(conversation, config, request).await, + "shell" => { + execute_exoclaw_shell_tool(agent, conversation, agent_config, config, request).await + } "schedule_sandbox_task" => { execute_schedule_task_tool(&self.scheduler_store, request).await } @@ -276,6 +296,10 @@ async fn execute_shell_tool( env: Default::default(), }) .await?; + read_shell_process(process).await +} + +async fn read_shell_process(process: Box) -> Result { let parts = process.into_parts(); let mut stdout = parts.stdout; let mut stderr = parts.stderr; @@ -298,3 +322,32 @@ async fn execute_shell_tool( exit_code, })?) 
} + +async fn execute_exoclaw_shell_tool( + agent: &dyn AgentHandle, + conversation: &dyn ConversationHandle, + agent_config: &AgentConfig, + config: &ConversationConfig, + request: &ToolRequest, +) -> Result { + if effective_sandbox_scope(agent_config, config) == SandboxScope::Conversation { + return execute_shell_tool(conversation, config, request).await; + } + + let args = + serde_json::from_value::(Value::Object(request.arguments.clone()))?; + let program = config + .shell_program + .clone() + .ok_or_else(|| anyhow::anyhow!("shell tool is not enabled for this conversation"))?; + let agent_sandbox = ensure_agent_sandbox(agent, conversation, agent_config, config).await?; + let process = agent_sandbox + .conversation + .run_in_sandbox(RunInSandboxRequest { + id: agent_sandbox.sandbox_id, + command: vec![program, "-lc".to_string(), args.command], + env: Default::default(), + }) + .await?; + read_shell_process(process).await +} diff --git a/crates/executor/src/lib.rs b/crates/executor/src/lib.rs index 886fed7..3862692 100644 --- a/crates/executor/src/lib.rs +++ b/crates/executor/src/lib.rs @@ -1,3 +1,4 @@ +mod agent_sandbox; mod basic; #[cfg(test)] mod basic_tests; @@ -33,8 +34,8 @@ pub use conversation_wakeup::send_conversation_wakeup; pub use executor_types::{ AgentConfig, AgentHarnessKind, ConversationConfig, ConversationModelConfig, ExecutionStreamEvent, ExecutionStreamHandle, ModelClient, ModelRequest, ModelResponse, - ModelResponseStream, PendingToolCall, SendRequest, SendResult, ToolDefinition, - ToolManifestEntry, ToolRuntime, TypeScriptHarnessConfig, + ModelResponseStream, PendingToolCall, SandboxScope, SendRequest, SendResult, ToolDefinition, + ToolManifestEntry, ToolRuntime, TypeScriptHarnessConfig, effective_sandbox_scope, }; pub use exoharness::{ AgentHandle, BasicExoHarness, Binding, BindingMetadata, ConversationHandle, EventData, EventId, diff --git a/crates/executor/src/scheduler_runtime.rs b/crates/executor/src/scheduler_runtime.rs index 
135284e..ea1d5b4 100644 --- a/crates/executor/src/scheduler_runtime.rs +++ b/crates/executor/src/scheduler_runtime.rs @@ -5,6 +5,7 @@ use exoharness::{RunInSandboxRequest, SandboxProcess, WriteArtifactRequest}; use futures::io::{AsyncRead, AsyncReadExt}; use serde::Serialize; +use crate::agent_sandbox::{AgentSandboxHandle, ensure_agent_sandbox}; use crate::conversation_sandbox::{create_conversation_sandbox, ensure_conversation_sandbox}; use crate::conversation_wakeup::send_conversation_wakeup; use crate::scheduler_store::SchedulerStore; @@ -141,17 +142,19 @@ async fn run_task_inner( let agent_config = agent.config().await?; let conversation_config = conversation.config().await?; let conversation_handle = conversation.exoharness_handle(); - let sandbox_id = resolve_task_sandbox( + let sandbox = resolve_task_sandbox( task, - conversation_handle.as_ref(), + agent.exoharness_handle().as_ref(), + std::sync::Arc::clone(&conversation_handle), &agent_config, &conversation_config, ) .await?; let command_result: Result = async { - let process = conversation_handle + let process = sandbox + .conversation .run_in_sandbox(RunInSandboxRequest { - id: sandbox_id.clone(), + id: sandbox.sandbox_id.clone(), command: task .setup_command .clone() @@ -162,7 +165,7 @@ async fn run_task_inner( let setup_output = read_process_output(process, task.max_output_bytes).await?; if task.setup_command.is_none() { return Ok(CommandOutput { - sandbox_id, + sandbox_id: sandbox.sandbox_id.clone(), setup: None, main: setup_output, error: None, @@ -170,22 +173,23 @@ async fn run_task_inner( } if setup_output.exit_code != Some(0) { return Ok(CommandOutput { - sandbox_id, + sandbox_id: sandbox.sandbox_id.clone(), setup: Some(setup_output), main: ProcessOutput::empty(), error: Some("setup command exited non-zero".to_string()), }); } - let process = conversation_handle + let process = sandbox + .conversation .run_in_sandbox(RunInSandboxRequest { - id: sandbox_id.clone(), + id: sandbox.sandbox_id.clone(), 
command: task.command.clone(), env: Default::default(), }) .await?; let main_output = read_process_output(process, task.max_output_bytes).await?; Ok(CommandOutput { - sandbox_id: sandbox_id.clone(), + sandbox_id: sandbox.sandbox_id.clone(), setup: Some(setup_output), main: main_output, error: None, @@ -271,23 +275,48 @@ async fn run_task_inner( async fn resolve_task_sandbox( task: &mut ScheduledTaskRecord, - conversation: &dyn exoharness::ConversationHandle, + agent: &dyn exoharness::AgentHandle, + conversation: std::sync::Arc, agent_config: &crate::AgentConfig, conversation_config: &crate::ConversationConfig, -) -> Result { +) -> Result { match task.sandbox_mode { - ScheduledTaskSandboxMode::Conversation => { - ensure_conversation_sandbox(conversation, agent_config, conversation_config).await + ScheduledTaskSandboxMode::Agent => { + ensure_agent_sandbox( + agent, + conversation.as_ref(), + agent_config, + conversation_config, + ) + .await } + ScheduledTaskSandboxMode::Conversation => Ok(AgentSandboxHandle { + sandbox_id: ensure_conversation_sandbox( + conversation.as_ref(), + agent_config, + conversation_config, + ) + .await?, + conversation, + }), ScheduledTaskSandboxMode::TaskFresh => { if let Some(sandbox_id) = &task.task_sandbox_id { - return Ok(sandbox_id.clone()); + return Ok(AgentSandboxHandle { + conversation, + sandbox_id: sandbox_id.clone(), + }); } - let sandbox_id = - create_conversation_sandbox(conversation, agent_config, conversation_config) - .await?; + let sandbox_id = create_conversation_sandbox( + conversation.as_ref(), + agent_config, + conversation_config, + ) + .await?; task.task_sandbox_id = Some(sandbox_id.clone()); - Ok(sandbox_id) + Ok(AgentSandboxHandle { + conversation, + sandbox_id, + }) } } } diff --git a/crates/executor/src/scheduler_types.rs b/crates/executor/src/scheduler_types.rs index 7be99eb..b949770 100644 --- a/crates/executor/src/scheduler_types.rs +++ b/crates/executor/src/scheduler_types.rs @@ -48,6 +48,7 @@ pub struct 
NewScheduledTask { #[serde(rename_all = "snake_case")] pub enum ScheduledTaskSandboxMode { #[default] + Agent, Conversation, TaskFresh, } @@ -246,7 +247,7 @@ mod tests { } #[test] - fn scheduled_task_defaults_to_conversation_sandbox() { + fn scheduled_task_defaults_to_agent_sandbox() { let task = ScheduledTaskRecord::new( NewScheduledTask { agent_id: "agent".to_string(), @@ -263,7 +264,7 @@ mod tests { ) .unwrap(); - assert_eq!(task.sandbox_mode, ScheduledTaskSandboxMode::Conversation); + assert_eq!(task.sandbox_mode, ScheduledTaskSandboxMode::Agent); assert_eq!(task.task_sandbox_id, None); } diff --git a/crates/executor/src/typescript.rs b/crates/executor/src/typescript.rs index f075080..3f546b3 100644 --- a/crates/executor/src/typescript.rs +++ b/crates/executor/src/typescript.rs @@ -80,12 +80,13 @@ where async fn prepare_conversation( &self, + agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, agent_config: &AgentConfig, conversation_config: &ConversationConfig, ) -> Result<()> { self.tools - .prepare_conversation(conversation, agent_config, conversation_config) + .prepare_conversation(agent, conversation, agent_config, conversation_config) .await } @@ -169,7 +170,7 @@ where async fn execute_runtime_request( &self, - _agent: &dyn AgentHandle, + agent: &dyn AgentHandle, conversation: &dyn ConversationHandle, _agent_config: &AgentConfig, conversation_config: &ConversationConfig, @@ -179,7 +180,13 @@ where RuntimeRequest::ExecuteTool { request } => Ok(RuntimeResponsePayload::ToolResult { result: self .tools - .execute(conversation, conversation_config, &request) + .execute( + agent, + conversation, + _agent_config, + conversation_config, + &request, + ) .await?, }), RuntimeRequest::StartSandboxProcess { .. 
} diff --git a/crates/exoharness/src/sandbox.rs b/crates/exoharness/src/sandbox.rs index 957f3e8..5de199f 100644 --- a/crates/exoharness/src/sandbox.rs +++ b/crates/exoharness/src/sandbox.rs @@ -118,7 +118,7 @@ const DEFAULT_ENABLED_NETWORK_NAME: &str = "exo-default"; const WARM_SANDBOX_KEEPALIVE_ARGV: &[&str] = &["sleep", "infinity"]; const WARM_SANDBOX_HEALTHCHECK_TIMEOUT: Duration = Duration::from_secs(3); const WARM_SANDBOX_CLEANUP_TIMEOUT: Duration = Duration::from_secs(5); -const ORPHANED_WARM_SANDBOX_MIN_AGE: Duration = Duration::from_secs(60 * 60); +const ORPHANED_WARM_SANDBOX_MIN_AGE: Duration = Duration::from_secs(24 * 60 * 60); const WARM_SANDBOX_KEY_LABEL: &str = "exo.sandbox.key"; const WARM_SANDBOX_SPEC_HASH_LABEL: &str = "exo.sandbox.spec-hash"; const WARM_SANDBOX_OWNER_PID_LABEL: &str = "exo.sandbox.owner-pid"; @@ -291,16 +291,9 @@ impl Default for AppleContainerSandboxBackend { impl Drop for AppleContainerSandboxBackend { fn drop(&mut self) { - let Ok(mut warm_sandboxes) = self.warm_sandboxes.try_lock() else { - return; - }; - let names = warm_sandboxes - .drain() - .filter_map(|(_, entry)| entry.owned.then_some(entry.name)) - .collect::>(); - for name in names { - cleanup_named_container_blocking(&self.container_bin, &name); - } + // Warm sandboxes intentionally outlive a single CLI/REPL process so a + // restarted Exoclaw agent can reattach to the same environment. Stale + // containers are cleaned by the orphan reaper on later backend startup. 
} } @@ -1074,11 +1067,6 @@ fn owner_pid_is_alive(pid: &str) -> bool { .is_ok_and(|status| status.success()) } -fn cleanup_named_container_blocking(container_bin: &Path, name: &str) { - run_container_admin_command_blocking(container_bin, ["stop", name]); - run_container_admin_command_blocking(container_bin, ["delete", name]); -} - fn schedule_cleanup_named_container(container_bin: PathBuf, name: String) { tokio::spawn(async move { if let Err(error) = cleanup_named_container(&container_bin, &name).await { @@ -1110,35 +1098,6 @@ async fn run_container_admin_command( } } -fn run_container_admin_command_blocking(container_bin: &Path, args: [&str; N]) { - let Ok(mut child) = std::process::Command::new(container_bin) - .args(args) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .spawn() - else { - return; - }; - - let deadline = Instant::now() + WARM_SANDBOX_CLEANUP_TIMEOUT; - loop { - match child.try_wait() { - Ok(Some(_)) => return, - Ok(None) if Instant::now() < deadline => std::thread::sleep(Duration::from_millis(50)), - Ok(None) => { - if let Err(error) = child.kill() { - eprintln!("failed to kill timed out container admin command: {error}"); - } - if let Err(error) = child.wait() { - eprintln!("failed to wait for timed out container admin command: {error}"); - } - return; - } - Err(_) => return, - } - } -} - fn is_missing_container_error(stderr: &str) -> bool { let lower = stderr.to_ascii_lowercase(); lower.contains("not found") || lower.contains("no such") diff --git a/examples/exoclaw/README.md b/examples/exoclaw/README.md index 14bebed..f2196ab 100644 --- a/examples/exoclaw/README.md +++ b/examples/exoclaw/README.md @@ -5,13 +5,17 @@ TypeScript harness turn loop, but opts into heavier runtime features: - scheduled sandbox tasks - live conversation wake-ups -- sticky conversation sandbox policy +- sticky agent sandbox policy +- optional `sandboxScope: "conversation"` conversation-scoped shell sandboxes +- optional `sandboxMode: "conversation"` scheduled tasks 
- optional `sandboxMode: "task_fresh"` task-owned sandboxes Use Exoclaw when the agent should keep working over time. Use `examples/typescript/basic-harness.ts` for a minimal TypeScript harness without scheduler tools. +Start an Exoclaw REPL with the default agent-scoped sandbox by running `scripts/exoclaw-repl --pull-sandbox` (see Sandbox Modes below). + ## Tools Exoclaw includes the normal minimal tools: @@ -32,17 +36,39 @@ It also adds scheduler tools: ## Sandbox Modes -Scheduled tasks default to `sandboxMode: "conversation"`. This uses the sticky -conversation sandbox, so packages installed through the REPL, such as `curl` or -`python3`, are available to scheduled task runs while that warm sandbox is still -alive. +Exoclaw conversations default to `sandboxScope: "agent"`. The `shell` tool uses +the sticky agent sandbox, so packages installed through the Exoclaw REPL, such as +`curl` or `python3`, are available to scheduled task runs and future +conversations for the same agent while that warm sandbox is still alive. Normal +REPL exits leave the warm sandbox running so the next Exoclaw process can +reattach to it. + +Because Exoclaw defaults to agent scope, you don't need to pass any scope option on +the CLI. The following command creates a REPL with the agent and a persistent +sandbox that is shared across that agent's conversations while the warm container stays alive: + +```bash +scripts/exoclaw-repl --pull-sandbox +``` + +If you want a conversation to have its own sandbox, use `sandboxScope: "conversation"`: + +```bash +scripts/exoclaw-repl --conversation isolated-dev --sandbox-scope conversation +exo --harness exoclaw conversation update exoclaw-agent isolated-dev --sandbox-scope conversation +``` + +Scheduled tasks also default to `sandboxMode: "agent"`. A task can explicitly use +`sandboxMode: "conversation"` to run in the current conversation's sandbox, or +`sandboxMode: "task_fresh"` to create a separate task-owned sandbox. Important limitation: the current sandbox filesystem is not durable across warm -container death.
Exoclaw stores a durable conversation sandbox record, but package -installs made interactively live in the running warm container. If the REPL exits, -the host restarts, or the container backend cleans up the warm container, a later +container death. Exoclaw stores a durable pointer to the agent's sandbox, but +package installs made interactively live in the running warm container. If the +host restarts or the container backend cleans up the warm container, a later scheduled task may recreate the sandbox from the base image and lose packages -installed with commands like `apt-get install python3`. +installed with commands like `apt-get install python3`. Stale warm containers are +eligible for orphan cleanup after roughly 24 hours. For reliable scheduled tasks, prefer one of these: @@ -51,12 +77,9 @@ For reliable scheduled tasks, prefer one of these: - Keep task code/data on mounted storage instead of relying on mutated container filesystem state. -Use `sandboxMode: "task_fresh"` when a task should have a separate fresh sandbox. -That sandbox starts from the configured image and mounts. It is reused across the -task's runs and stopped when the task is cancelled. +A `sandboxMode: "task_fresh"` task-owned sandbox starts from the configured image and mounts. It is reused +across the task's runs and stopped when the task is cancelled. -The right long-term scope is still open. Conversation-scoped sandboxes are useful -for making one conversation's setup visible to its scheduled tasks, but -agent-scoped sandboxes may be more intuitive for long-running agents that manage -multiple conversations. This should likely become configurable, with an explicit -durability model rather than relying on warm container lifetime. +The current scope model is Exoclaw-specific policy on top of conversation-owned +exoharness sandbox records. The default mental model is agent-scoped, while +conversation and task scopes remain available for isolation.
diff --git a/examples/exoclaw/harness.ts b/examples/exoclaw/harness.ts index a3e77cd..32d2518 100644 --- a/examples/exoclaw/harness.ts +++ b/examples/exoclaw/harness.ts @@ -49,7 +49,7 @@ function exoclawInstructions(context: TurnContext): Message[] { { role: "developer", content: - 'This is the Exoclaw long-running agent harness. You can schedule recurring sandbox work with schedule_sandbox_task, inspect active tasks with list_scheduled_tasks, cancel tasks with cancel_scheduled_task, and permanently delete tasks with delete_scheduled_task. Use cancel_scheduled_task when task history should be preserved; use delete_scheduled_task when the user asks to remove a task entirely. Scheduled tasks default to sandboxMode: "conversation", which uses this conversation\'s sticky sandbox and can reuse tools installed through the REPL. Use sandboxMode: "task_fresh" only when the task should have a separate fresh sandbox that is reused across that task\'s runs.', + 'This is the Exoclaw long-running agent harness. You can schedule recurring sandbox work with schedule_sandbox_task, inspect active tasks with list_scheduled_tasks, cancel tasks with cancel_scheduled_task, and permanently delete tasks with delete_scheduled_task. Use cancel_scheduled_task when task history should be preserved; use delete_scheduled_task when the user asks to remove a task entirely. Conversations default to sandboxScope: "agent", so shell commands use this agent\'s shared sandbox unless the conversation was configured with sandboxScope: "conversation". Scheduled tasks default to sandboxMode: "agent". 
Use sandboxMode: "conversation" when the task should run in this conversation\'s sandbox, and sandboxMode: "task_fresh" when the task should have a separate fresh sandbox that is reused across that task\'s runs.', }, ]; } diff --git a/scripts/exoclaw-repl b/scripts/exoclaw-repl index 8892b12..c2bd667 100755 --- a/scripts/exoclaw-repl +++ b/scripts/exoclaw-repl @@ -15,6 +15,7 @@ HARNESS="exoclaw" SANDBOX_IMAGE="${EXO_SANDBOX_IMAGE:-ubuntu:24.04}" NETWORKING="${EXO_NETWORKING:-enabled}" SHELL_PROGRAM="${EXO_SHELL_PROGRAM:-/bin/bash}" +SANDBOX_SCOPE="${EXO_SANDBOX_SCOPE:-}" SCHEDULER_INTERVAL_SECONDS="${EXO_SCHEDULER_INTERVAL_SECONDS:-10}" COMMAND="repl" USE_SANDBOX=true @@ -43,6 +44,7 @@ Options: --sandbox-image Sandbox image (default: ubuntu:24.04) --networking enabled or disabled (default: enabled) --shell-program Shell in the sandbox (default: /bin/bash) + --sandbox-scope agent or conversation (default: Exoclaw agent) --scheduler-interval Scheduler polling interval (default: 10) --no-scheduler Do not start the local scheduled task runner --scheduler Start the local scheduled task runner @@ -55,8 +57,8 @@ Options: Environment overrides: EXO_MODEL, EXO_AGENT, EXO_CONVERSATION, EXO_AGENT_NAME, EXO_CONVERSATION_NAME, EXO_MODULE, EXO_SANDBOX_IMAGE, - EXO_NETWORKING, EXO_SHELL_PROGRAM, EXO_ENV_FILE, EXO_BIN, - EXO_START_SCHEDULER, EXO_SCHEDULER_INTERVAL_SECONDS + EXO_NETWORKING, EXO_SHELL_PROGRAM, EXO_SANDBOX_SCOPE, EXO_ENV_FILE, + EXO_BIN, EXO_START_SCHEDULER, EXO_SCHEDULER_INTERVAL_SECONDS EOF } @@ -198,15 +200,25 @@ ensure_agent() { ensure_conversation() { if conversation_exists; then + if [[ -n "$SANDBOX_SCOPE" ]]; then + exo conversation update "$AGENT" "$CONVERSATION" \ + --sandbox-scope "$SANDBOX_SCOPE" >/dev/null + fi return fi echo "Creating conversation $CONVERSATION..." 
- exo conversation create "$AGENT" "$CONVERSATION_NAME" \ - --slug "$CONVERSATION" + local args=(conversation create "$AGENT" "$CONVERSATION_NAME" --slug "$CONVERSATION") + if [[ -n "$SANDBOX_SCOPE" ]]; then + args+=(--sandbox-scope "$SANDBOX_SCOPE") + fi + exo "${args[@]}" if [[ "$USE_SANDBOX" == true ]]; then - exo conversation update "$AGENT" "$CONVERSATION" \ - --shell-program "$SHELL_PROGRAM" >/dev/null + local update_args=(conversation update "$AGENT" "$CONVERSATION" --shell-program "$SHELL_PROGRAM") + if [[ -n "$SANDBOX_SCOPE" ]]; then + update_args+=(--sandbox-scope "$SANDBOX_SCOPE") + fi + exo "${update_args[@]}" >/dev/null fi } @@ -359,6 +371,11 @@ while [[ $# -gt 0 ]]; do [[ -n "$SHELL_PROGRAM" ]] || die "--shell-program requires a value" shift 2 ;; + --sandbox-scope) + SANDBOX_SCOPE="${2:-}" + [[ "$SANDBOX_SCOPE" == "agent" || "$SANDBOX_SCOPE" == "conversation" ]] || die "--sandbox-scope must be agent or conversation" + shift 2 + ;; --scheduler-interval) SCHEDULER_INTERVAL_SECONDS="${2:-}" [[ "$SCHEDULER_INTERVAL_SECONDS" =~ ^[0-9]+$ && "$SCHEDULER_INTERVAL_SECONDS" -gt 0 ]] || die "--scheduler-interval requires a positive integer" diff --git a/typescript/harness/index.ts b/typescript/harness/index.ts index 7b6aa3f..2ab7bbc 100644 --- a/typescript/harness/index.ts +++ b/typescript/harness/index.ts @@ -90,6 +90,7 @@ export interface SecretMetadata { export interface ConversationConfig { enableNetworking: boolean; shellProgram?: string | null; + sandboxScope?: "agent" | "conversation" | null; mounts: FileSystemMount[]; } diff --git a/typescript/harness/runner.ts b/typescript/harness/runner.ts index f6b006c..3feaa26 100644 --- a/typescript/harness/runner.ts +++ b/typescript/harness/runner.ts @@ -65,6 +65,7 @@ interface RawToolManifestEntry { interface RawConversationConfig { enable_networking: boolean; shell_program?: string | null; + sandbox_scope?: "agent" | "conversation" | null; mounts: Array<{ host_path: string; mount_path: string; @@ -809,6 +810,7 @@ 
function toConversationConfig(raw: RawConversationConfig): ConversationConfig { return { enableNetworking: raw.enable_networking, shellProgram: raw.shell_program ?? null, + sandboxScope: raw.sandbox_scope ?? null, mounts: raw.mounts.map(toFileSystemMount), }; } diff --git a/typescript/harness/scheduler-tools.ts b/typescript/harness/scheduler-tools.ts index af3b86e..f75a706 100644 --- a/typescript/harness/scheduler-tools.ts +++ b/typescript/harness/scheduler-tools.ts @@ -39,7 +39,7 @@ function createScheduleSandboxTaskTool(): ToolInstance { definition: { name: "schedule_sandbox_task", description: - "Schedule a recurring command to run in this conversation's sandbox. A host scheduler owns timing and will wake this conversation with compact results when runs complete. The scheduler reuses the shared conversation sandbox when available; use setupCommand for task-specific setup that should run before each scheduled run.", + "Schedule a recurring command to run in this Exoclaw agent's shared sandbox by default. A host scheduler owns timing and will wake this conversation with compact results when runs complete. Use setupCommand for task-specific setup that should run before each scheduled run.", parameters: { type: "object", additionalProperties: false, @@ -63,11 +63,11 @@ function createScheduleSandboxTaskTool(): ToolInstance { }, sandboxMode: { anyOf: [ - { type: "string", enum: ["conversation", "task_fresh"] }, + { type: "string", enum: ["agent", "conversation", "task_fresh"] }, { type: "null" }, ], description: - "Sandbox selection mode. Use 'conversation' or null to run in the shared persistent conversation sandbox. Use 'task_fresh' to create a separate fresh sandbox for this task and reuse it across that task's runs.", + "Sandbox selection mode. Use 'agent' or null to run in the shared persistent agent sandbox. Use 'conversation' for a sandbox scoped to this conversation. 
Use 'task_fresh' to create a separate fresh sandbox for this task and reuse it across that task's runs.", }, setupCommand: { anyOf: [ @@ -79,7 +79,7 @@ function createScheduleSandboxTaskTool(): ToolInstance { { type: "null" }, ], description: - "Optional argv to run immediately before each scheduled run in the shared conversation sandbox, for example ['bash', '-lc', 'apt-get update && apt-get install -y curl']. Use this for dependencies that should be prepared before each run.", + "Optional argv to run immediately before each scheduled run, for example ['bash', '-lc', 'apt-get update && apt-get install -y curl']. Use this for dependencies that should be prepared before each run.", }, reportPrompt: { type: "string",