diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 61d713d0..b070add5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,4 +59,4 @@ jobs: uses: astral-sh/setup-uv@v5 - name: Run pyright - run: uv run --with=".[rl,dev]" pyright + run: uv run --with=".[dev]" pyright diff --git a/README.md b/README.md index bd4fdfd3..ff67506a 100644 --- a/README.md +++ b/README.md @@ -6,395 +6,133 @@ -OSS RL environment + evals toolkit. Wrap software as environments, run benchmarks, and train with RL โ€“ locally or at scale. +The HUD SDK is an open-source Python toolkit for building, evaluating, and training AI agents. Use a unified API for any model provider, wrap your code as MCP environments, run A/B evals at scale, and train with reinforcement learning. -[![PyPI version](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/) +To learn more, check out our [Documentation](https://docs.hud.ai) and [API Reference](https://docs.hud.ai/reference). + +[![PyPI](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/) [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE) [![Add docs to Cursor](https://img.shields.io/badge/Add%20docs%20to-Cursor-black?style=flat-square)](https://cursor.com/en/install-mcp?name=docs-hud-python&config=eyJ1cmwiOiJodHRwczovL2RvY3MuaHVkLmFpL21jcCJ9) [![Discord](https://img.shields.io/discord/1327447144772407390?label=Discord&logo=discord&style=flat-square)](https://discord.gg/wkjtmHYYjm) [![X Follow](https://img.shields.io/twitter/follow/hud_evals?style=social)](https://x.com/intent/user?screen_name=hud_evals) [![Shop](https://img.shields.io/badge/_-white.svg?label=shop&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAJCAYAAAAywQxIAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAACxMAAAsTAQCanBgAAAF6SURBVChTlZA9ixNhFIWf8yaTpFHRRMXCKpAZhCAYFvwoLHZhwUKw9A9YCJb+Bq0sxGbBQrTxX1j41dvIRAjGZbdwRUUGIzPMeyw2swS3WZ/ynHvP5VylafoAWAd+5Xm+wX+SpukmcMf29RDCZrD9BViz3f53+CjYngKZpD5A2/Y7SQBMJpOkKIprdV1vdzqdHzHGblmW9Ww2+5pl2TmAxWKxmM/nP8fj8cmqqtZijJ9sb0u6ABBWjh0riuIt8CqE8LGu66e2d5MkeQ8QY3xme7fb7T4ZjUbrZVl+jjFuSXoEXGxCDgIl9WzfAO5LSmzvNB771R6vzG4Bx0MIt/M8vwV8aLyDQNt70+n0G1AspaTxVln+aghQluVsKbvxVysflT9NQK/XO7R/SGiQ9Nt2aftElmWXJd1kv0kbeANQVdWl4XB4XtJouXaqNRgMHkrqS+r0+/3XwD1JXdungRfAVWBi+6WkK8D3EMJz22cl3W21WgNgx3YAzvwFd0Chdq03gKUAAAAASUVORK5CYII=&style=social)](https://shop.hud.ai) [![Scarf](https://static.scarf.sh/a.png?x-pxid=6530ff33-4945-452b-81f9-626872593933)](https://scarf.sh) +[![Docs](https://img.shields.io/badge/docs-hud.ai-blue?style=flat-square)](https://docs.hud.ai) - -### Are you an enterprise building agents? - -[๐Ÿ“… Hop on a call](https://cal.com/jay-hud) or [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) - -## Highlights - -- ๐Ÿš€ **[MCP environment skeleton](https://docs.hud.ai/core-concepts/mcp-protocol)** โ€“ any agent can call any environment. -- โšก๏ธ **[Live telemetry](https://hud.ai)** โ€“ inspect every tool call, observation, and reward in real time. -- ๐Ÿ—‚๏ธ **[Public benchmarks](https://hud.ai/leaderboards)** โ€“ OSWorld-Verified, SheetBench-50, and more. -- ๐ŸŒ **[Cloud browsers](environments/remote_browser/)** โ€“ AnchorBrowser, Steel, BrowserBase integrations for browser automation. -- ๐Ÿ› ๏ธ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** โ€“ `hud dev` for iterating on environments without rebuilds. 
-- ๐ŸŽ“ **[One-click RL](https://hud.ai/models)** โ€“ Run `hud rl` to get a trained model on any environment. - -> We welcome contributors and feature requests โ€“ open an issue or hop on a call to discuss improvements! - -## Installation +## Install ```bash -# SDK - MCP servers, telemetry, evaluation pip install hud-python - -# CLI - RL pipeline, environment design -uv tool install hud-python@latest --python 3.12 -# uv tool update-shell -``` - -> See [docs.hud.ai](https://docs.hud.ai), or add docs to any MCP client: -> `claude mcp add --transport http docs-hud https://docs.hud.ai/mcp` - -Before starting, get your HUD_API_KEY at [hud.ai](https://hud.ai). - - -## Quickstart: Evals - -For a tutorial that explains the agent and evaluation design, run: - -```python -uvx hud-python quickstart ``` -Or just write your own agent loop (more [examples here](examples/)). - -```python -import asyncio, hud, os -from hud.settings import settings -from hud.clients import MCPClient -from hud.agents import ClaudeAgent -from hud.datasets import Task # See docs: https://docs.hud.ai/reference/tasks - -async def main() -> None: - with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.ai) - task = { - "prompt": "Reach 64 in 2048.", - "mcp_config": { - "hud": { - "url": "https://mcp.hud.ai/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.ai/core-concepts/architecture) - "headers": { - "Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.ai - "Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython - } - } - }, - "evaluate_tool": {"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 64}}}, - } - task = Task(**task) - - # 1. Define the client explicitly: - client = MCPClient(mcp_config=task.mcp_config) - agent = ClaudeAgent( - mcp_client=client, - model="claude-sonnet-4-5", # requires ANTHROPIC_API_KEY - ) - - result = await agent.run(task) - - # 2. 
Or just: - # result = await ClaudeAgent().run(task) - - print(f"Reward: {result.reward}") - await client.shutdown() - -asyncio.run(main()) -``` - -The above example let's the agent play 2048 ([See replay](https://hud.ai/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f)) - -![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif) - -## Quickstart: Training - -RL using GRPO a Qwen2.5-VL model on any hud dataset: +Get your API key at [hud.ai](https://hud.ai) and set it: ```bash -hud get hud-evals/2048-basic # from HF -hud rl 2048-basic.json +export HUD_API_KEY=your-key-here ``` -> See [agent training docs](https://docs.hud.ai/train-agents/quickstart) - -Or make your own environment and dataset: - -```bash -hud init my-env && cd my-env -hud dev --interactive -# When ready to run: -hud rl -``` +> For CLI tools (`hud init`, `hud dev`, etc.): `uv tool install hud-python --python 3.12` -> See [environment design docs](https://docs.hud.ai/build-environments) +![Agent running on SheetBench](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif) -## Benchmarking Agents +## Usage -This is Claude Computer Use running on our proprietary financial analyst benchmark [SheetBench-50](https://huggingface.co/datasets/hud-evals/SheetBench-50): +### Unified Model API -![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif) +Use Claude, GPT, Gemini, or Grok through one OpenAI-compatible endpoint: -> [See this trace on _hud.ai_](https://hud.ai/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a) - -This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py): - -```bash -python examples/run_evaluation.py hud-evals/SheetBench-50 --full --agent claude -``` +```python +from openai import AsyncOpenAI +import os -Or in code: +client = AsyncOpenAI( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) -```python -import asyncio -from hud.datasets import run_dataset -from hud.agents import ClaudeAgent - -results = await run_dataset( - name="My SheetBench-50 Evaluation", - dataset="hud-evals/SheetBench-50", # <-- HuggingFace dataset - agent_class=ClaudeAgent, # <-- Your custom agent can replace this (see https://docs.hud.ai/evaluate-agents/create-agents) - agent_config={"model": "claude-sonnet-4-5"}, - max_concurrent=50, - max_steps=30, +response = await client.chat.completions.create( + model="claude-sonnet-4-5", # or gpt-4o, gemini-2.5-pro (https://hud.ai/models) + messages=[{"role": "user", "content": "Hello!"}] ) -print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}") ``` -> Running a dataset creates a job and streams results to the [hud.ai](https://hud.ai) platform for analysis and [leaderboard submission](https://docs.hud.ai/evaluate-agents/leaderboards). - -## Building Environments (MCP) +Every call is traced at [hud.ai](https://hud.ai). โ†’ [Docs](https://docs.hud.ai/quick-links/gateway) -This is how you can make any environment into an interactable one in 5 steps: +### Environments -1. Define MCP server layer using [`MCPServer`](https://docs.hud.ai/reference/environments) +Turn your code into tools agents can call. 
Define how to evaluate them: ```python -from hud.server import MCPServer -from hud.tools import HudComputerTool +from hud import Environment -mcp = MCPServer("My Environment") +env = Environment("my-env") -# Add hud tools (see all tools: https://docs.hud.ai/reference/tools) -mcp.tool(HudComputerTool()) +@env.tool() +def search(query: str) -> str: + """Search the knowledge base.""" + return db.search(query) -# Or custom tools (see https://docs.hud.ai/build-environments/adapting-software) -@mcp.tool("launch_app"): -def launch_app(name: str = "Gmail") -... - -if __name__ == "__main__": - mcp.run() +@env.scenario("find-answer") +async def find_answer(question: str, answer: str): + response = yield f"Find: {question}" # Prompt + yield 1.0 if answer in response else 0.0 # Reward ``` -2. Write a simple Dockerfile that installs packages and runs: +The agent runs between the yields. First yield sends the prompt, second yield scores the result. โ†’ [Docs](https://docs.hud.ai/quick-links/environments) ยท [Templates](https://hud.ai/environments) -```python -CMD ["python", "-m", "hud_controller.server"] -``` +### A/B Evals -And build the image: +Test different models. Repeat runs to see the distribution: -```bash -hud build # runs docker build under the hood -``` +```python +import hud -Or run it in interactible mode +task = env("find-answer", question="What is 2+2?", answer="4") -```bash -hud dev +async with hud.eval(task, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": ctx.prompt}] + ) + await ctx.submit(response.choices[0].message.content) ``` -3. Debug it with the CLI to see if it launches: +**Variants** test configurations. **Groups** repeat for distribution. Results stream to [hud.ai](https://hud.ai). โ†’ [Docs](https://docs.hud.ai/quick-links/ab-testing) -```console -$ hud debug my-name/my-environment:latest +### Deploy & Train -โœ“ Phase 1: Docker image exists -โœ“ Phase 2: MCP server responds to initialize -โœ“ Phase 3: Tools are discoverable -โœ“ Phase 4: Basic tool execution works -โœ“ Phase 5: Parallel performance is good - -Progress: [โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ] 5/5 phases (100%) -โœ… All phases completed successfully! -``` - -Analyze it to see if all tools appear: - -```console -$ hud analyze hudpython/hud-remote-browser:latest -โ  โœ“ Analysis complete -... -Tools -โ”œโ”€โ”€ Regular Tools -โ”‚ โ”œโ”€โ”€ computer -โ”‚ โ”‚ โ””โ”€โ”€ Control computer with mouse, keyboard, and screenshots -... -โ””โ”€โ”€ Hub Tools - โ”œโ”€โ”€ setup - โ”‚ โ”œโ”€โ”€ navigate_to_url - โ”‚ โ”œโ”€โ”€ set_cookies - โ”‚ โ”œโ”€โ”€ ... - โ””โ”€โ”€ evaluate - โ”œโ”€โ”€ url_match - โ”œโ”€โ”€ page_contains - โ”œโ”€โ”€ cookie_exists - โ”œโ”€โ”€ ... - -๐Ÿ“ก Telemetry Data - Live URL https://live.anchorbrowser.io?sessionId=abc123def456 -``` - -4. When the tests pass, push it up to the docker registry: +Push to GitHub, connect on hud.ai, run at scale: ```bash -hud push # needs docker login, hud api key -``` - -5. 
Now you can use `mcp.hud.ai` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.ai](https://hud.ai): - -```python -from hud.agents import ClaudeAgent - -result = await ClaudeAgent().run({ # See all agents: https://docs.hud.ai/reference/agents - "prompt": "Please explore this environment", - "mcp_config": { - "my-environment": { - "url": "https://mcp.hud.ai/v3/mcp", - "headers": { - "Authorization": f"Bearer {os.getenv('HUD_API_KEY')}", - "Mcp-Image": "my-name/my-environment:latest" - } - } - # "my-environment": { # or use hud run which wraps local and remote running - # "cmd": "hud", - # "args": [ - # "run", - # "my-name/my-environment:latest", - # ] - # } - } -}) - +hud init # Scaffold environment +git push # Push to GitHub +# Connect on hud.ai โ†’ New โ†’ Environment +hud eval my-org/my-eval --model gpt-4o --group-size 100 +# Or create and run tasks on the platform ``` -> See the full environment design guide and common pitfalls in [`environments/README.md`](environments/README.md) +Every run generates training data. Use it to fine-tune or run RL. โ†’ [Docs](https://docs.hud.ai/quick-links/deploy) -## Leaderboards & benchmarks +## Links -All leaderboards are publicly available on [hud.ai/leaderboards](https://hud.ai/leaderboards) (see [docs](https://docs.hud.ai/evaluate-agents/leaderboards)) +- ๐Ÿ“– [Documentation](https://docs.hud.ai) +- โŒจ๏ธ [CLI Reference](https://docs.hud.ai/reference/cli/overview) +- ๐Ÿ† [Leaderboards](https://hud.ai/leaderboards) +- ๐ŸŒ [Environment Templates](https://hud.ai/environments) +- ๐Ÿค– [Supported Models](https://hud.ai/models) +- ๐Ÿ’ฌ [Discord](https://discord.gg/wkjtmHYYjm) -![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png) +## Enterprise -We highly suggest running 3-5 evaluations per dataset for the most consistent results across multiple jobs. +Building agents at scale? We work with teams on custom environments, benchmarks, and training. -Using the [`run_dataset`](https://docs.hud.ai/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it: - -## Reinforcement Learning with GRPO - -This is a Qwenโ€‘2.5โ€‘VLโ€‘3B agent training a policy on the 2048-basic browser environment: - -![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png) - -Train with the new interactive `hud rl` flow: - -```bash -# Install CLI -uv tool install hud-python@latest --python 3.12 - -# Option A: Run directly from a HuggingFace dataset -hud rl hud-evals/2048-basic - -# Option B: Download first, modify, then train -hud get hud-evals/2048-basic -hud rl 2048-basic.json - -# Optional: baseline evaluation -hud eval 2048-basic.json -``` - -Supports multiโ€‘turn RL for both: -- Languageโ€‘only models (e.g., `Qwen/Qwen2.5-7B-Instruct`) -- Visionโ€‘Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`) - -By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.ai`, and lets you monitor/manage models at `hud.ai/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training). - -Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.ai/train-agents/quickstart`. 
- -Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart โ†’ Pricing](https://docs.hud.ai/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.ai/project/billing). - -## Architecture - -```mermaid -%%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%% -graph LR - subgraph "Platform" - Dashboard["๐Ÿ“Š hud.ai"] - API["๐Ÿ”Œ mcp.hud.ai"] - end - - subgraph "hud" - Agent["๐Ÿค– Agent"] - Task["๐Ÿ“‹ Task"] - SDK["๐Ÿ“ฆ SDK"] - end - - subgraph "Environments" - LocalEnv["๐Ÿ–ฅ๏ธ Local Docker
(Development)"] - RemoteEnv["โ˜๏ธ Remote Docker
(100s Parallel)"] - end - - subgraph "otel" - Trace["๐Ÿ“ก Traces & Metrics"] - end - - Dataset["๐Ÿ“š Dataset
(HuggingFace)"] - - AnyMCP["๐Ÿ”— Any MCP Client
(Cursor, Claude, Custom)"] - - Agent <--> SDK - Task --> SDK - Dataset <-.-> Task - SDK <-->|"MCP"| LocalEnv - SDK <-->|"MCP"| API - API <-->|"MCP"| RemoteEnv - SDK --> Trace - Trace --> Dashboard - AnyMCP -->|"MCP"| API - -``` - -## CLI reference - -| Command | Purpose | Docs | -| ----------------------- | ------------------------------------------ | ---- | -| [`hud init`](https://docs.hud.ai/reference/cli/init) | Create new environment with boilerplate. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/init) | -| [`hud dev`](https://docs.hud.ai/reference/cli/dev) | Hot-reload development with Docker. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/dev) | -| [`hud build`](https://docs.hud.ai/reference/cli/build) | Build image and generate lock file. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/build) | -| [`hud push`](https://docs.hud.ai/reference/cli/push) | Share environment to registry. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/push) | -| [`hud pull `](https://docs.hud.ai/reference/cli/pull) | Get environment from registry. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/pull) | -| [`hud analyze `](https://docs.hud.ai/reference/cli/analyze) | Discover tools, resources, and metadata. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/analyze) | -| [`hud debug `](https://docs.hud.ai/reference/cli/debug) | Five-phase health check of an environment. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/debug) | -| [`hud run `](https://docs.hud.ai/reference/cli/run) | Run MCP server locally or remotely. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/run) | - -## Roadmap - -- Merging our forks in to the main `mcp`, `mcp_use` repositories -- Helpers for building new environments (see [current guide](environments/README.md)) -- Integrations with every major agent framework -- Evaluation environment registry -- MCP opentelemetry standard +[๐Ÿ“… Book a call](https://cal.com/jay-hud) ยท [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) ## Contributing -We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. - -Key areas: -- [Environment examples](environments/) - Add new MCP environments -- [Agent implementations](hud/agents/) - Add support for new LLM providers -- [Tool library](hud/tools/) - Extend the built-in tool collection -- [RL training](hud/rl/) - Improve reinforcement learning pipelines +We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md). -Thanks to all our contributors! +Key areas: [Agents](hud/agents/) ยท [Tools](hud/tools/) ยท [Environments](https://hud.ai/environments) @@ -412,4 +150,4 @@ Thanks to all our contributors! } ``` -> **License**: HUD is released under the MIT License โ€“ see the [LICENSE](LICENSE) file for details. +MIT License ยท [LICENSE](LICENSE) diff --git a/docs/advanced/testing-environments.mdx b/docs/advanced/testing-environments.mdx new file mode 100644 index 00000000..99ece815 --- /dev/null +++ b/docs/advanced/testing-environments.mdx @@ -0,0 +1,105 @@ +--- +title: "Testing Environments" +description: "Test scenarios, tools, and environment logic locally" +icon: "flask-vial" +--- + +Before deploying, test locally. See [Sandboxing](/guides/sandboxing) for Docker vs no-Docker patterns. 
+ +## Local Testing + +| Environment | `local_test.py` | +|-------------|-----------------| +| No Docker | `from env import env` | +| Docker | `env.connect_url("http://localhost:8765/mcp")` | + +Both use the same API after setup: + +```python +async with env: + tools = env.as_tools() # List available tools + result = await env.call_tool("my_tool", arg="val") # Call a tool +``` + +## Testing Scenarios Directly + +Scenarios are async generators. `hud.eval()` drives them automatically, but you can test the logic directlyโ€”this is exactly what runs at the start and end of `hud.eval()`: + +```python +async def checkout(user_id: str, amount: int = 100): + # Setup + prompt (first yield) โ€” runs at hud.eval() start + answer = yield f"Complete checkout for {user_id}, ${amount}" + + # Evaluation (second yield) โ€” runs after agent submits + yield 1.0 if "success" in answer.lower() else 0.0 + +async def test(): + gen = checkout("alice", 50) + prompt = await anext(gen) # What hud.eval() does at start + reward = await gen.asend("Success!") # What hud.eval() does after submit + assert reward == 1.0 +``` + +If your scenario tests pass, `hud.eval()` will behave identically. + +## Mocking + +`env.mock()` intercepts at the tool layerโ€”agents only see tools: + +```python +env.mock() # All tools return fake responses +env.mock_tool("send_email", {"status": "sent"}) + +# Check mock state +assert env.is_mock == True +``` + +## Hot-Reload + +For Docker environments, `hud dev -w path` reloads Python on save: + +```bash +hud dev -w scenarios -w tools --port 8765 +``` + +System services (postgres, VNC, browsers) persist across reloads. + +## Debugging Build Failures + +`hud build` runs the exact same pipeline as **New โ†’ Environment** on [hud.ai](https://hud.ai)โ€”so if it passes locally, it'll work in production. If the build fails or the container crashes on startup, use `hud debug` to run a 5-phase compliance test: + +```bash +hud debug my-env:latest +``` + +Output shows exactly which phase failed: +``` +โœ“ Phase 1: Docker image exists +โœ“ Phase 2: MCP server responds to initialize +โœ— Phase 3: Tool discovery failed + โ†’ Error: Connection refused on port 8005 + โ†’ Hint: Backend service may not be starting +``` + +You can also debug a directory (builds first) or stop at a specific phase: + +```bash +hud debug . # Build and debug current directory +hud debug . --max-phase 3 # Stop after phase 3 +hud debug --config mcp.json # Debug from config file +``` + +## Useful Environment Properties + +```python +# Check parallelization (for running multiple evals) +env.is_parallelizable # True if all connections are remote + +# List what's connected +env.connections # Dict of connection names โ†’ connectors +env.is_connected # True if in async context + +# Resources and prompts (beyond tools) +await env.list_resources() # MCP resources +await env.list_prompts() # MCP prompts +``` diff --git a/docs/beta/index.mdx b/docs/beta/index.mdx index b318cad3..6485a3fd 100644 --- a/docs/beta/index.mdx +++ b/docs/beta/index.mdx @@ -11,5 +11,5 @@ Beta features are experimental and may change in future releases. 
## Available Beta Features - Fine-tune models with reinforcement learning on your HUD tasks (invite-only) + Fine-tune models on your HUD tasks (invite-only) diff --git a/docs/build-environments/index.mdx b/docs/build-environments/index.mdx index 40ec910f..4981b22e 100644 --- a/docs/build-environments/index.mdx +++ b/docs/build-environments/index.mdx @@ -66,9 +66,6 @@ hud eval tasks.json # Deploy to registry hud push - -# Train agents on your tasks -hud rl tasks.json ``` --- @@ -83,7 +80,6 @@ hud rl tasks.json | Troubleshoot | `hud debug my-env:dev` | | Build image | `hud build` | | Push to registry | `hud push` | -| RL training | `hud rl tasks.json` | --- @@ -93,3 +89,20 @@ hud rl tasks.json * **CLI reference**: [CLI Overview](/reference/cli/overview) Have fun โ€“ and remember: *stderr for logs, stdout for MCP!* + +--- + +## Available Environments + +Browse ready-to-use environments and templates at **[hud.ai/environments](https://hud.ai/environments)**. + +| Environment | Description | +|-------------|-------------| +| `hud-blank` | Minimal starter template | +| `hud-browser` | Browser automation with Playwright | +| `hud-remote-browser` | Cloud browser providers (Steel, Anchor, etc.) | +| `hud-deepresearch` | Deep research with web search | +| `hud-rubrics` | LLM-as-judge evaluations | +| `coding-template` | Full coding env with VNC, Postgres, Redis | + +Each environment is available as a GitHub template you can fork and customize. diff --git a/docs/build-environments/spec.mdx b/docs/build-environments/spec.mdx index a87160df..61069b21 100644 --- a/docs/build-environments/spec.mdx +++ b/docs/build-environments/spec.mdx @@ -24,7 +24,7 @@ graph TD - No nonโ€‘MCP output on stdout (all logging to stderr). - No required file layout, framework, or endpoints. -Recommended (for HUD RL/evals): provide tools named `setup` and `evaluate`. +Recommended (for HUD evals): provide tools named `setup` and `evaluate`. ## Make it runnable remotely (mcp.hud.ai) @@ -143,7 +143,7 @@ The same structure is used by `hud init`โ€™s template and by programmatic tasks. ] ``` -Switching this file to remote is as simple as replacing the `mcp_config` with the `hud` section shown above (or using `hud rl`, which will help convert it automatically). +Switching this file to remote is as simple as replacing the `mcp_config` with the `hud` section shown above (or using `hud convert`, which will help convert it automatically). 
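+
+For reference, a minimal sketch of what the swapped-in remote `mcp_config` can look like (the image tag below is a placeholder; use the image you published with `hud push`, and substitute your own HUD API key for the bearer token):
+
+```json
+{
+  "hud": {
+    "url": "https://mcp.hud.ai/v3/mcp",
+    "headers": {
+      "Authorization": "Bearer <HUD_API_KEY>",
+      "Mcp-Image": "your-org/your-env:latest"
+    }
+  }
+}
+```
+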
Run tasks with either the CLI or an agent: diff --git a/docs/docs.json b/docs/docs.json index b9091131..2b73c81c 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -29,12 +29,81 @@ "navigation": { "versions": [ { - "version": "0.4.74", + "version": "0.5.0", "groups": [ { "group": "Get Started", "pages": [ "index", + "llm-quickstart" + ] + }, + { + "group": "Essentials", + "pages": [ + "quick-links/gateway", + "quick-links/ab-testing", + "quick-links/environments", + "quick-links/deploy" + ] + }, + { + "group": "Guides", + "pages": [ + "guides/integrations", + "guides/sandboxing", + "guides/best-practices", + "migration" + ] + }, + { + "group": "Advanced", + "pages": [ + "advanced/testing-environments" + ] + }, + { + "group": "SDK Reference", + "pages": [ + "reference/evals", + "reference/environments", + "reference/tools", + "reference/mcpserver", + "reference/agents", + "reference/types" + ] + }, + { + "group": "CLI Reference", + "pages": [ + "reference/cli/overview", + "reference/cli/init", + "reference/cli/dev", + "reference/cli/build", + "reference/cli/push", + "reference/cli/analyze", + "reference/cli/debug", + "reference/cli/run", + "reference/cli/eval", + "reference/cli/rft", + "reference/cli/misc" + ] + }, + { + "group": "Community", + "pages": [ + "contributing" + ] + } + ] + }, + { + "version": "0.4.73", + "groups": [ + { + "group": "Get Started", + "pages": [ + "index-legacy", "quickstart", "llm-quickstart" ] @@ -50,10 +119,11 @@ { "group": "SDK Reference", "pages": [ + "reference/eval", "reference/tools", "reference/agents", "reference/types", - "reference/environments", + "reference/mcpserver", "reference/tasks" ] }, @@ -64,17 +134,10 @@ "build-environments/spec" ] }, - { - "group": "Training (RL)", - "pages": [ - "train-agents/quickstart", - "train-agents/tasks" - ] - }, { "group": "HUD Gateway", "pages": [ - "gateway/index" + "gateway/index-legacy" ] }, { @@ -103,7 +166,6 @@ "reference/cli/debug", "reference/cli/run", "reference/cli/eval", - "reference/cli/rl", "reference/cli/rft", "reference/cli/misc" ] diff --git a/docs/evaluate-agents/benchmarks.mdx b/docs/evaluate-agents/benchmarks.mdx index b63d9b17..09561a30 100644 --- a/docs/evaluate-agents/benchmarks.mdx +++ b/docs/evaluate-agents/benchmarks.mdx @@ -18,7 +18,30 @@ hud eval tasks.json hud eval hud-evals/SheetBench-50 claude --full ``` -- SDK +- SDK (Context Manager) + +```python +import hud + +# Single task evaluation +async with hud.eval("hud-evals/SheetBench-50:0") as ctx: + agent = MyAgent() + result = await agent.run(ctx) + ctx.reward = result.reward + +# All tasks with variants +async with hud.eval( + "hud-evals/SheetBench-50:*", + variants={"model": ["claude-sonnet", "gpt-4o"]}, + group=3, + max_concurrent=50, +) as ctx: + agent = create_agent(model=ctx.variants["model"]) + result = await agent.run(ctx) + ctx.reward = result.reward +``` + +- SDK (Batch Execution) ```python from hud.datasets import run_tasks @@ -108,8 +131,9 @@ results = await run_tasks( ## See Also -- [`hud eval`](/reference/cli/eval) -- [`hud rl`](/reference/cli/rl) +- [Evaluation API](/reference/eval) - SDK reference for `hud.eval()` +- [`hud eval`](/reference/cli/eval) - CLI reference +- [`hud rft`](/reference/cli/rft) - [Tasks](/reference/tasks) - [Agents (SDK)](/reference/agents) diff --git a/docs/gateway/index.mdx b/docs/gateway/index-legacy.mdx similarity index 99% rename from docs/gateway/index.mdx rename to docs/gateway/index-legacy.mdx index ea235980..a60b6811 100644 --- a/docs/gateway/index.mdx +++ b/docs/gateway/index-legacy.mdx @@ 
-1,5 +1,5 @@ --- -title: "HUD Gateway" +title: "Gateway" description: "Unified LLM inference service with built-in auth and credit management." icon: "server" --- @@ -128,3 +128,4 @@ This example demonstrates: - Automatic token usage and latency tracking View your traces on the [HUD Dashboard](https://hud.ai/home). + diff --git a/docs/guides/best-practices.mdx b/docs/guides/best-practices.mdx new file mode 100644 index 00000000..662cbab9 --- /dev/null +++ b/docs/guides/best-practices.mdx @@ -0,0 +1,142 @@ +--- +title: "Best Practices" +description: "Design effective environments, evals, and grading logic" +icon: "star" +--- + +Building good agent evaluations requires thoughtful design at every layerโ€”the environment, the prompts, and the grading logic. This guide covers patterns that lead to useful, reliable signal. + +## Good Environments + +A good environment gives agents what they need to succeedโ€”and gives you what you need to evaluate them. + +### Observable State + +Agents need access to the right information. If they can't see the data they need, they can't complete the task. Design tools that expose useful state: + +```python +# โŒ Bad: Agent can't see what was created +@env.tool() +def create_user(name: str) -> str: + db.insert("users", name=name) + return "User created" + +# โœ… Good: Agent gets actionable data back +@env.tool() +def create_user(name: str) -> dict: + user_id = db.insert("users", name=name) + return {"id": user_id, "name": name, "created": True} +``` + +For grading, you also need to observe what happened. If the agent creates a database row, you need to query that database. If it uploads a file, you need to read that file. Be cognizant of what you can and cannot observeโ€”only ask agents to do things you can verify. + +### Deterministic Setup + +Each eval should seed the state it needs. HUD handles container isolationโ€”you handle making sure your scenario sets up the right data before the agent runs. + +```python +# โŒ Bad: Depends on whatever state exists +@env.scenario("find-user") +async def find_user(name: str): + answer = yield f"Find the user named {name}" + yield 1.0 if name in answer else 0.0 + +# โœ… Good: Seeds known state before eval +@env.scenario("find-user") +async def find_user(name: str): + await db.clear() + await db.insert("users", name=name, email=f"{name}@example.com") + + answer = yield f"Find the user named {name}" + yield 1.0 if name in answer else 0.0 +``` + +### Isolated Execution + +HUD sandboxes each evalโ€”containers don't share state. But if your environment connects to external services, think about stateful vs stateless. + +**Stateless services** are fine. Multiple agents can hit the same read-only API without interference. + +**Stateful services** need care. If 100 agents all hit the same database endpoint that modifies data, they'll step on each other. Use per-eval instances, transaction isolation, or target different records. + +## Good Evals + +An eval combines a prompt (the first `yield`) with grading logic (everything after). The prompt tells agents what to doโ€”write short-to-medium length instructions that ask for an unambiguous change you can verify. + +### Be Specific + +Ambiguous prompts lead to ambiguous grading. Say exactly what you want: + +``` +โŒ "Update the user settings" +โœ… "Change the email for user alice@example.com to alice.new@example.com" +``` + +Real-world example: *"Add a column to the Portfolio snapshot with the 'Phase' of the engagement. 
C-11X should be 'Phase 2', all else are 'Phase 1'."* + +### Only Ask for Testable Things + +If you can't observe the result, you can't grade it. Don't ask an agent to "think about" somethingโ€”ask it to do something you can verify. + +``` +โŒ "Consider the best approach to optimize the query" +โœ… "Rewrite the query to use an index on the email column" +``` + +### Create Variations + +Evals are easier to write when you have a specific failure mode in mind. If you've observed agents struggling with something, incorporate that into future evals. + +Create different versions with more or less explicit instructionsโ€”step-by-step guidance vs. high-level goals. Use [variants](/quick-links/ab-testing) to test these systematically. Variations make it easier to tune difficulty later. + +## Good Graders + +The grading logic after the first `yield` determines the grade. Fair grading means useful signal. + +### Match the Prompt + +If the prompt says "create a document with a Japanese car brand", check for any Japanese car brandโ€”not just "Toyota". But don't accept any document either. Exactly as strict as the prompt implies. + +```python +# โŒ Bad: Too strictโ€”only accepts one answer +@env.scenario("add-car") +async def add_car(): + answer = yield "Add a Japanese car brand to the document" + yield 1.0 if answer == "Toyota" else 0.0 + +# โœ… Good: Accepts any valid answer +@env.scenario("add-car") +async def add_car(): + answer = yield "Add a Japanese car brand to the document" + japanese_brands = ["toyota", "honda", "nissan", "mazda", "subaru"] + yield 1.0 if any(brand in answer.lower() for brand in japanese_brands) else 0.0 +``` + +### Use Partial Credit + +Partial grades help you see where agents fail. Did they add to cart but not checkout? That's useful signal. Break complex grading into sub-checks with weighted grades: + +```python +@env.scenario("checkout") +async def checkout(product: str): + answer = yield f"Add {product} to cart and checkout" + + score = 0.0 + if await product_in_cart(product): + score += 0.3 # Partial credit for first step + if await order_completed(product): + score += 0.7 # Most credit for completion + yield score +``` + +### Sanity Check + +At minimum, verify two cases: unchanged state โ†’ 0.0, correct completion โ†’ 1.0. For grading logic you'll reuse across many evals, write unit tests. Load a known state snapshot, verify the grade matches what you expect. + +## Finding the Right Difficulty + +A good eval set has rangeโ€”target 20-30% average success rate. You want high variance: some runs should grade 0.0, others 1.0. If every run grades the same, there's no signal to learn from. Having both positive and negative examples on the same eval is what makes improvement possible. + +**Iterate.** Create an eval, test it manually, run it at scale, check the difficulty. If it's too easy or too hard, adjust the prompt or grading. Use your best evals as templates for more. + +**Train.** Every eval generates dataโ€”prompts, tool calls, grades. Use successful runs for fine-tuning. The loop: eval โ†’ analyze โ†’ train โ†’ eval again. diff --git a/docs/guides/integrations.mdx b/docs/guides/integrations.mdx new file mode 100644 index 00000000..0d826e07 --- /dev/null +++ b/docs/guides/integrations.mdx @@ -0,0 +1,430 @@ +--- +title: "Integrations" +description: "Use any agent framework with HUD environments" +icon: "robot" +--- + +HUD environments work with any agent framework. 
The `Environment` class provides format converters for all major providers, and `hud.eval()` handles setup, evaluation, and tracing automatically. + +Every example on this page uses the `eval` defined below and the [Gateway](/quick-links/gateway) for inference. + +## The Example Environment + +```python +import hud + +CEOS = {"hud": "Jay Ram", "openai": "Sam Altman", "anthropic": "Dario Amodei"} + +env = hud.Environment("trivia") + +@env.tool() +def lookup_ceo(company: str) -> str: + """Look up the CEO of a company.""" + return CEOS.get(company.lower(), "Unknown") + +@env.scenario("initials") +async def find_initials(company: str): + answer = yield f"What are the initials of the CEO of {company}?" + ceo = CEOS.get(company.lower()) + correct = "".join(word[0] for word in ceo.split()) if ceo else None + yield 1.0 if answer and correct and correct in answer.upper() else 0.0 + +task = env("initials", company="HUD") +``` + +--- + +## OpenAI + +The OpenAI SDK supports three APIs: Chat Completions, Responses, and the Agents SDK. + +### Chat Completions + +```python +import os +from openai import AsyncOpenAI +import hud + +client = AsyncOpenAI( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + messages = [{"role": "user", "content": ctx.prompt}] + + while True: + response = await client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=ctx.as_openai_chat_tools() + ) + + msg = response.choices[0].message + messages.append(msg) + + if not msg.tool_calls: + break + + for tool_call in msg.tool_calls: + result = await ctx.call_tool(tool_call) + messages.append(result) + + await ctx.submit(msg.content or "") +``` + +### Responses API + +```python +async with hud.eval(eval) as ctx: + response = await client.responses.create( + model="gpt-4o", + input=ctx.prompt, + tools=ctx.as_openai_responses_tools() + ) + + for item in response.output: + if item.type == "function_call": + await ctx.call_tool(item) + + await ctx.submit(response.output_text) +``` + +### Agents SDK + +```python +from agents import Agent, Runner +import hud + +async with hud.eval(eval) as ctx: + agent = Agent( + name="trivia-agent", + instructions="Answer trivia questions. Use tools to look up information.", + tools=ctx.as_openai_agent_tools() + ) + + result = await Runner.run(agent, ctx.prompt) + await ctx.submit(result.final_output) +``` + +Requires: `pip install openai-agents` + +--- + +## Anthropic + +Claude's Messages API with tool use. + +```python +import os +from anthropic import AsyncAnthropic +import hud + +client = AsyncAnthropic( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + messages = [{"role": "user", "content": ctx.prompt}] + + while True: + response = await client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=1024, + messages=messages, + tools=ctx.as_claude_tools() + ) + + tool_uses = [b for b in response.content if b.type == "tool_use"] + if not tool_uses: + break + + tool_results = [await ctx.call_tool(block) for block in tool_uses] + + messages.append({"role": "assistant", "content": response.content}) + messages.append({"role": "user", "content": tool_results}) + + text = next((b.text for b in response.content if b.type == "text"), "") + await ctx.submit(text) +``` + +Requires: `pip install anthropic` + +--- + +## Gemini + +Google's Gemini API with function calling. 
+ +```python +import os +import google.generativeai as genai +import hud + +genai.configure(api_key=os.environ["GOOGLE_API_KEY"]) +model = genai.GenerativeModel("gemini-2.0-flash") + +async with hud.eval(eval) as ctx: + chat = model.start_chat() + + response = chat.send_message( + ctx.prompt, + tools=ctx.as_gemini_tools(), + tool_config=ctx.as_gemini_tool_config() + ) + + while True: + part = response.candidates[0].content.parts[0] + if not hasattr(part, "function_call") or not part.function_call: + break + + result = await ctx.call_tool(part) + response = chat.send_message(result) + + await ctx.submit(response.text) +``` + +Requires: `pip install google-generativeai` + +--- + +## browser-use + +Browser automation for web agents. + +```python +import os +from browser_use import Agent +from langchain_openai import ChatOpenAI +import hud + +llm = ChatOpenAI( + model="gpt-4o", + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + agent = Agent(task=ctx.prompt, llm=llm) + result = await agent.run() + await ctx.submit(str(result)) +``` + +Requires: `pip install browser-use playwright && playwright install` + +--- + +## LangChain + +LangChain's agent framework with tool calling. + +```python +import os +from langchain_openai import ChatOpenAI +from langchain.agents import create_tool_calling_agent, AgentExecutor +from langchain_core.prompts import ChatPromptTemplate +import hud + +llm = ChatOpenAI( + model="gpt-4o", + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + tools = ctx.as_langchain_tools() + + prompt = ChatPromptTemplate.from_messages([ + ("system", "You are a helpful assistant."), + ("human", "{input}"), + ("placeholder", "{agent_scratchpad}"), + ]) + + agent = create_tool_calling_agent(llm, tools, prompt) + executor = AgentExecutor(agent=agent, tools=tools) + + result = await executor.ainvoke({"input": ctx.prompt}) + await ctx.submit(result["output"]) +``` + +Requires: `pip install langchain langchain-openai langchain-core` + +--- + +## LlamaIndex + +LlamaIndex's ReAct agent with tool integration. + +```python +import os +from llama_index.llms.openai import OpenAI +from llama_index.core.agent import ReActAgent +import hud + +llm = OpenAI( + model="gpt-4o", + api_base="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + tools = ctx.as_llamaindex_tools() + + agent = ReActAgent.from_tools(tools, llm=llm, verbose=True) + response = await agent.achat(ctx.prompt) + + await ctx.submit(str(response)) +``` + +Requires: `pip install llama-index-core llama-index-llms-openai` + +--- + +## Google ADK + +Google's Agent Development Kit for Gemini-powered agents. + +```python +import os +from google.adk.agents import Agent +from google.adk.runners import Runner +import hud + +async with hud.eval(eval) as ctx: + agent = Agent( + name="trivia-agent", + model="gemini-2.0-flash", + instruction="Answer trivia questions. Use tools to look up information.", + tools=ctx.as_adk_tools() + ) + + runner = Runner(agent=agent) + result = await runner.run(ctx.prompt) + + await ctx.submit(result.output) +``` + +Requires: `pip install google-adk` + +--- + +## CrewAI + +Multi-agent orchestration with roles and tasks. 
+ +```python +import os +from crewai import Agent, Task, Crew +from langchain_openai import ChatOpenAI +import hud + +llm = ChatOpenAI( + model="gpt-4o", + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + tools = ctx.as_langchain_tools() + + researcher = Agent( + role="Researcher", + goal="Find accurate information", + backstory="Expert at finding information", + tools=tools, + llm=llm + ) + + task = LegacyTask( + description=ctx.prompt, + expected_output="The initials of the CEO", + agent=researcher + ) + + crew = Crew(agents=[researcher], tasks=[task]) + result = crew.kickoff() + await ctx.submit(str(result)) +``` + +Requires: `pip install crewai langchain-openai` + +--- + +## AutoGen + +Microsoft's multi-agent conversation framework. + +```python +import os +from autogen import AssistantAgent, UserProxyAgent +import hud + +async with hud.eval(eval) as ctx: + config_list = [{ + "model": "gpt-4o", + "base_url": "https://inference.hud.ai", + "api_key": os.environ["HUD_API_KEY"] + }] + + assistant = AssistantAgent( + name="assistant", + llm_config={"config_list": config_list} + ) + + for tool in ctx.as_tools(): + @assistant.register_for_execution() + async def tool_fn(name=tool.name, **kwargs): + return await ctx.call_tool(name, **kwargs) + + user = UserProxyAgent( + name="user", + human_input_mode="NEVER", + code_execution_config=False + ) + + result = await user.a_initiate_chat(assistant, message=ctx.prompt) + await ctx.submit(result.summary) +``` + +Requires: `pip install pyautogen` + +--- + +## Format Reference + +| Method | Returns | Use With | +|--------|---------|----------| +| `as_openai_chat_tools()` | OpenAI Chat format | OpenAI Chat Completions | +| `as_openai_responses_tools()` | OpenAI Responses format | OpenAI Responses API | +| `as_openai_agent_tools()` | FunctionTool objects | OpenAI Agents SDK | +| `as_claude_tools()` | Anthropic format | Claude API | +| `as_gemini_tools()` | Gemini format | Google AI | +| `as_adk_tools()` | ADK FunctionTool objects | Google ADK | +| `as_langchain_tools()` | StructuredTool objects | LangChain, CrewAI | +| `as_llamaindex_tools()` | FunctionTool objects | LlamaIndex | +| `as_tools()` | MCP Tool objects | Raw MCP, AutoGen | + +All `call_tool()` calls auto-detect the input format and return matching output format. + +--- + +## Bring Your Own + +Don't see your framework? The pattern is simple: + +1. Get tools in your framework's format (or use `as_tools()` for raw MCP) +2. Run your agent loop +3. Call `ctx.call_tool()` for each tool invocation +4. Call `ctx.submit()` with the final answer + +```python +async with hud.eval(eval) as ctx: + tools = ctx.as_tools() # Raw MCP format + + result = await my_custom_agent(ctx.prompt, tools, ctx.call_tool) + + await ctx.submit(result) +``` + +The environment handles setup, evaluation, and tracing. You handle the agent logic. diff --git a/docs/guides/sandboxing.mdx b/docs/guides/sandboxing.mdx new file mode 100644 index 00000000..dbebcb3d --- /dev/null +++ b/docs/guides/sandboxing.mdx @@ -0,0 +1,161 @@ +--- +title: "Sandboxing" +description: "Turn your existing services into agent-testable environments" +icon: "shield" +--- + +You have a production stack. You want an agent on it. But you can't just point an agent at productionโ€”it'll make real changes, hit real APIs, affect real users. And you can't test at scale against a single live instance with shared state. 
+ +HUD lets you mock your production environment so agents can run against it safely. Connect your services in a few lines, mock external dependencies, and run thousands of agents in parallelโ€”each isolated, each reproducible, each generating useful data. + +## Connecting Your Stack + +HUD wraps your existing infrastructure without rewriting it: + +```python +from hud import Environment + +env = Environment("my-env") + +# Connect what you already have +env.connect_fastapi(app) # FastAPI โ†’ tools +env.connect_openapi("https://api.example.com/openapi.json") # OpenAPI spec โ†’ tools +env.connect_hub("hud-evals/browser") # HUD Hub environments +env.connect_image("my-service:v1") # Docker images +``` + +## Making Databases Safe + +Agents need isolated state. Three patterns work: + +**In-memory SQLite** โ€” fastest, resets automatically: +```python +import sqlite3 +db = sqlite3.connect(":memory:") # Fresh per eval + +@env.scenario("update-order") +async def update_order(order_id: str): + db.executescript(Path("fixtures/orders.sql").read_text()) # Seed + answer = yield f"Update order {order_id} to shipped" + row = db.execute("SELECT status FROM orders WHERE id=?", (order_id,)).fetchone() + yield 1.0 if row and row[0] == "shipped" else 0.0 +``` + +**Transaction rollback** โ€” use your real DB, undo changes: +```python +@env.scenario("process-refund") +async def process_refund(order_id: str): + conn = await asyncpg.connect(DATABASE_URL) + tx = conn.transaction() + await tx.start() + try: + answer = yield f"Process refund for order {order_id}" + # Check result... + yield reward + finally: + await tx.rollback() # Always undo + await conn.close() +``` + +**Fixture seeding** โ€” deterministic starting state: +```python +await db.execute("TRUNCATE orders, users CASCADE") +await db.executemany("INSERT INTO users ...", fixtures["users"]) +``` + +## Mocking External Services + +`env.mock()` intercepts at the tool layer. Agents only see tools, so this is usually all you need: + +```python +env.mock() # All tools return schema-based fake responses +env.mock_tool("send_email", {"status": "sent", "id": "mock-123"}) +env.mock_tool("charge_card", {"success": True, "transaction_id": "tx-mock"}) +``` + +For stateful mocking (tracking what happened for assertions): + +```python +class MockPaymentService: + def __init__(self): + self.charges = [] + + async def charge(self, amount: int, card_token: str) -> dict: + self.charges.append({"amount": amount, "token": card_token}) + return {"success": True, "id": f"ch-{len(self.charges)}"} + +payments = MockPaymentService() + +@env.scenario("checkout") +async def checkout(cart_total: int): + _ = yield f"Complete checkout for ${cart_total}" + yield 1.0 if any(c["amount"] == cart_total for c in payments.charges) else 0.0 +``` + +## Docker vs No Docker + +| Pattern | When to Use | Examples | +|---------|-------------|----------| +| **No Docker** | Pure Python, API integrations | Web research, LLM grading | +| **Docker** | System dependencies, persistent services | VNC, PostgreSQL, browsers | + +### Pattern 1: No Docker + +Import and test directly: + +```python +# local_test.py +from env import env + +async def test(): + async with env: + result = await env.call_tool("search", query="test") +``` + +### Pattern 2: Docker + +Connect to the running container instead of importing. 
Same API, different transportโ€”because your tools now run inside the container where dependencies live: + +```python +# local_test.py +env = Environment("browser-env") +env.connect_url("http://localhost:8765/mcp") # Connect instead of import + +async def test(): + async with env: # Same API from here + result = await env.call_tool("navigate", url="https://example.com") +``` + +```bash +hud build # Build image +hud dev -w scenarios -w tools --port 8765 # Start with hot-reload +python local_test.py # Connects to container +``` + +### Hot-Reload + +`hud dev -w path` reloads Python on save. System services (postgres, VNC) persist. + +**Rebuild** (`hud build`) when: Dockerfile, system packages, or dependencies change. + +## Environment Structure + +Start simple, add structure as needed: + +``` +# Simple # Organized +my-env/ my-env/ +โ”œโ”€โ”€ env.py โ”œโ”€โ”€ env.py +โ”œโ”€โ”€ local_test.py โ”œโ”€โ”€ scenarios/ +โ””โ”€โ”€ Dockerfile.hud โ”œโ”€โ”€ setup/ + โ”œโ”€โ”€ evaluate/ + โ””โ”€โ”€ Dockerfile.hud +``` + +Most environments fall somewhere between. Split when files get hard to navigate. + +## What's Next + +**Test locally.** See [Testing Environments](/advanced/testing-environments) for debugging and scenario testing. + +**Deploy.** Push to GitHub, connect on [hud.ai](https://hud.ai). See [Deploy](/quick-links/deploy). diff --git a/docs/index-legacy.mdx b/docs/index-legacy.mdx new file mode 100644 index 00000000..ecccffeb --- /dev/null +++ b/docs/index-legacy.mdx @@ -0,0 +1,113 @@ +--- +title: "Introduction" +description: "OSS environment + evals toolkit for AI agents." +icon: "book" +--- + + +**Version 0.4.73** - Latest stable release + + + + + Test Claude, Operator, or custom agents on benchmarks like SheetBench and OSWorld + + + + Wrap any software in dockerized MCP for scalable and generalizable agent evaluation + + + +## What is HUD? + +HUD connects AI agents to software environments using the Model Context Protocol (MCP). Whether you're evaluating existing agents or building new environments, HUD provides the infrastructure. + +```mermaid +graph LR + Agent["๐Ÿค– Any Agent
(Claude, Operator, etc.)"] + MCP["๐Ÿ”Œ MCP Protocol
(Tool Calls)"] + Env["๐Ÿ“ฆ Any Environment
(Browser, OS, etc.)"] + + Agent -->|"call_tool()"| MCP + MCP -->|"click(x, y)"| Env + Env -->|"screenshot"| MCP + MCP -->|"get_response()"| Agent + + style Agent fill:#3b82f6,stroke:#1e40af,stroke-width:2px,color:#ffffff + style MCP fill:#f59e0b,stroke:#d97706,stroke-width:2px,color:#ffffff + style Env fill:#10b981,stroke:#047857,stroke-width:2px,color:#ffffff +``` + +## Why HUD? + +- **๐Ÿ”Œ MCP-native**: Any agent can connect to any environment +- **๐Ÿ“ก Live telemetry**: Debug every tool call at [hud.ai](https://hud.ai) +- **โšก HUD Gateway**: Unified inference API for all LLMs +- **๐Ÿš€ Production-ready**: From local Docker to cloud scale +- **๐ŸŽฏ Built-in benchmarks**: OSWorld-Verified, SheetBench-50, and more +- **๐Ÿ”ง CLI tools**: Create, develop, and run with `hud init`, `hud dev`, `hud run`, `hud eval` + + + + Run your first agent evaluation with zero setup + + + + Unified inference API for OpenAI, Anthropic, Gemini, and Open Source Models + + + + Give your AI assistant full knowledge of HUD docs + + + + + +## Quick Example + +```python +import asyncio, os, hud +from hud.datasets import Task +from hud.agents import ClaudeAgent + +async def main(): + # Define evaluation task with remote MCP + task = Task( + prompt="Win a game of 2048 by reaching the 128 tile", + mcp_config={ + "hud": { + "url": "https://mcp.hud.ai/v3/mcp", + "headers": { + "Authorization": f"Bearer {os.getenv('HUD_API_KEY')}", + "Mcp-Image": "hudevals/hud-text-2048:0.1.3" + } + } + }, + setup_tool={"name": "setup", "arguments": {"name": "board", "arguments": { "board_size": 4}}}, + evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 64}}} + ) + + # Run agent (auto-creates MCP client) + agent = ClaudeAgent.create() + result = await agent.run(task) + print(f"Score: {result.reward}") + +asyncio.run(main()) +``` + +## Community + + + + Star the repo and contribute + + + + Join our community + + + +### Are you an enterprise building agents? + +[๐Ÿ“… Hop on a call](https://cal.com/jay-hud) or [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) + diff --git a/docs/index.mdx b/docs/index.mdx index da51407e..6bd837f1 100644 --- a/docs/index.mdx +++ b/docs/index.mdx @@ -1,104 +1,126 @@ --- title: "Introduction" -description: "OSS RL environment + evals toolkit." +description: "Build, evaluate, and train AI agents." icon: "book" --- - -**Version 0.4.74** - Latest stable release - +HUD gives you three things: a unified API for every model, a way to turn your code into agent-callable tools, and infrastructure to run evaluations at scale. - - - Test Claude, Operator, or custom agents on benchmarks like SheetBench and OSWorld - +## Install - - Wrap any software in dockerized MCP for scalable and generalizable agent evaluation - +```bash +# Install CLI +uv tool install hud-python --python 3.12 - - Use reinforcement learning and GRPO on evaluations to improve agent performance - - +# Set your API key +hud set HUD_API_KEY=your-key-here +``` -## What is HUD? - -HUD connects AI agents to software environments using the Model Context Protocol (MCP). Whether you're evaluating existing agents, building new environments, or training models with RL, HUD provides the infrastructure. - -```mermaid -graph LR - Agent["๐Ÿค– Any Agent
(Claude, Operator, etc.)"] - MCP["๐Ÿ”Œ MCP Protocol
(Tool Calls)"] - Env["๐Ÿ“ฆ Any Environment
(Browser, OS, etc.)"] - - Agent -->|"call_tool()"| MCP - MCP -->|"click(x, y)"| Env - Env -->|"screenshot"| MCP - MCP -->|"get_response()"| Agent - - style Agent fill:#3b82f6,stroke:#1e40af,stroke-width:2px,color:#ffffff - style MCP fill:#f59e0b,stroke:#d97706,stroke-width:2px,color:#ffffff - style Env fill:#10b981,stroke:#047857,stroke-width:2px,color:#ffffff +Get your API key at [hud.ai/settings/api-keys](https://hud.ai/settings/api-keys). + +## 1. Gateway: Any Model, One API + +Stop juggling API keys. Point any OpenAI-compatible client at `inference.hud.ai` and use Claude, GPT, Gemini, or Grok: + +```python +from openai import AsyncOpenAI +import os + +client = AsyncOpenAI( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +response = await client.chat.completions.create( + model="claude-sonnet-4-5", # or gpt-4o, gemini-2.5-pro, grok-4-1-fast... + messages=[{"role": "user", "content": "Hello!"}] +) ``` -## Why HUD? +Every call is traced. View them at [hud.ai/home](https://hud.ai/home). -- **๐Ÿ”Œ MCP-native**: Any agent can connect to any environment -- **๐Ÿ“ก Live telemetry**: Debug every tool call at [hud.ai](https://hud.ai) -- **โšก HUD Gateway**: Unified inference API for all LLMs -- **๐Ÿš€ Production-ready**: From local Docker to cloud scale -- **๐ŸŽฏ Built-in benchmarks**: OSWorld-Verified, SheetBench-50, and more - - **๐Ÿ”ง CLI tools**: Create, develop, run, and train with `hud init`, `hud dev`, `hud run`, `hud eval`, `hud rl` +โ†’ [More on Gateway](/quick-links/gateway) - - - Run your first agent evaluation with zero setup - +## 2. Environments: Your Code, Agent-Ready - - Unified inference API for OpenAI, Anthropic, Gemini, and Open Source Models - +A production API is one live instance with shared stateโ€”you can't run 1,000 parallel tests without them stepping on each other. Environments spin up fresh for every evaluation: isolated, deterministic, reproducible. Each generates training data. - - Give your AI assistant full knowledge of HUD docs - - +Turn your code into tools agents can call. Define scripts that evaluate what agents do: + +```python +from hud import Environment + +env = Environment("my-env") + +@env.tool() +def search(query: str) -> str: + """Search the knowledge base.""" + return db.search(query) + +@env.scenario("find-answer") +async def find_answer(question: str): + answer = yield f"Find the answer to: {question}" + yield 1.0 if "correct" in answer.lower() else 0.0 +``` +Scripts define the prompt (first yield) and the scoring logic (second yield). The agent runs in between. +โ†’ [More on Environments](/quick-links/environments) -## Quick Example +## 3. Evals: Test and Improve + +Run your scenario with different models. 
Compare results: ```python -import asyncio, os, hud -from hud.datasets import Task -from hud.agents import ClaudeAgent - -async def main(): - # Define evaluation task with remote MCP - task = Task( - prompt="Win a game of 2048 by reaching the 128 tile", - mcp_config={ - "hud": { - "url": "https://mcp.hud.ai/v3/mcp", - "headers": { - "Authorization": f"Bearer {os.getenv('HUD_API_KEY')}", - "Mcp-Image": "hudevals/hud-text-2048:0.1.3" - } - } - }, - setup_tool={"name": "setup", "arguments": {"name": "board", "arguments": { "board_size": 4}}}, - evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 64}}} +import hud + +task = env("find-answer", question="What is 2+2?") + +async with hud.eval(task, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": ctx.prompt}] ) - - # Run agent (auto-creates MCP client) - agent = ClaudeAgent.create() - result = await agent.run(task) - print(f"Score: {result.reward}") + await ctx.submit(response.choices[0].message.content) +``` + +**Variants** test different configurations. **Groups** repeat each to see the distribution. Results show up on [hud.ai](https://hud.ai/home) with scores, traces, and side-by-side comparisons. -asyncio.run(main()) +โ†’ [More on A/B Evals](/quick-links/ab-testing) + +## 4. Deploy and Scale + +Push your environment to GitHub, connect it on [hud.ai](https://hud.ai), and run thousands of evals in parallel. Every run generates training data. + +```bash +hud init # Scaffold environment +git push # Push to GitHub +# Connect on hud.ai โ†’ New โ†’ Environment +hud eval my-org/my-eval --model gpt-4o --group-size 100 ``` +โ†’ [More on Deploy](/quick-links/deploy) + +## Next Steps + + + + One endpoint for every model. Full observability. + + + + Tools, scripts, and local testing. + + + + Variants, groups, and finding what works. + + + + Run at scale. Generate training data. + + + ## Community @@ -107,11 +129,12 @@ asyncio.run(main()) - Join our community + Join the community -### Are you an enterprise building agents? +## Enterprise -[๐Ÿ“… Hop on a call](https://cal.com/jay-hud) or [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) +Building agents at scale? We work with teams on custom environments, benchmarks, and training pipelines. +[๐Ÿ“… Book a call](https://cal.com/jay-hud) ยท [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) diff --git a/docs/llm-quickstart.mdx b/docs/llm-quickstart.mdx index 7bde2a04..bcd99d95 100644 --- a/docs/llm-quickstart.mdx +++ b/docs/llm-quickstart.mdx @@ -30,5 +30,5 @@ icon: "sparkles" -Try asking your assistant: "How do I create a custom agent in HUD?" or "Help me debug MCP tool calls" +Try asking: "How do I create an Environment with tools?" or "How do scripts and evals work in HUD?" \ No newline at end of file diff --git a/docs/migration.mdx b/docs/migration.mdx new file mode 100644 index 00000000..f8f76ce0 --- /dev/null +++ b/docs/migration.mdx @@ -0,0 +1,183 @@ +--- +title: "Migrating from v4" +description: "Transition from Task-based environments to the unified Environment class" +icon: "arrow-right-arrow-left" +--- + +v4 separated environments (Docker containers) from evaluation logic (Task objects). v5 unifies everything in the `Environment` classโ€”tools, setup, and scoring live together. 
+ + +**Deprecation Notice**: `LegacyTask`, `setup_tool`, and `evaluate_tool` are deprecated in v0.5.0 and will be removed in v0.6.0 (no earlier than March 1st, 2026). Use `Task.from_v4()` for quick migration or `@env.scenario()` for new code. + + +## Good News: Your Code Still Works + +`Environment` inherits from `MCPServer`. Same API, same behavior. Just change the import: + +```python +# Before +from hud.server import MCPServer +mcp = MCPServer("my-env") + +@mcp.tool() +def my_tool(): ... + +mcp.run() +``` + +```python +# After +from hud import Environment +env = Environment("my-env") + +@env.tool() +def my_tool(): ... + +env.run() +``` + +That's it. Your Dockerfile, your tools, your `run()` callโ€”all unchanged. Environment adds scripts, connectors, and integrations on top. + +## Migration Path 1: Quick Conversion with Task.from_v4() + +The fastest way to migrate existing v4 codeโ€”no changes to task definitions needed: + +```python +# BEFORE (deprecated in v0.6.0) +from hud.datasets import LegacyTask + +legacy_task = LegacyTask( + prompt="Navigate to google.com", + mcp_config={"hud": {...}}, + setup_tool={"name": "navigate", "arguments": {"url": "https://google.com"}}, + evaluate_tool={"name": "check_url", "arguments": {}} +) + +# AFTER - One-line conversion +from hud.eval import Task + +task = Task.from_v4(legacy_task) # Converts LegacyTask โ†’ Task +# Also works with: Task.from_v4(dict), Task.from_v4(json_string) + +# Works the same with agents +agent = ClaudeAgent.create() +result = await agent.run(task) +``` + +`Task.from_v4()` automatically: +- Runs `setup_tool` at the start of evaluation +- Runs `evaluate_tool` at the end to compute reward +- Preserves all existing behavior + +## Migration Path 2: Full Scenario Migration (Recommended) + +For new code or when refactoring, migrate `setup_tool` and `evaluate_tool` to `@env.scenario()`. + +**The rule is simple:** +- `setup_tool` code โ†’ **before the first yield** +- `evaluate_tool` code โ†’ **after the first yield** + +```python +# BEFORE (deprecated in v0.6.0) +task = LegacyTask( + prompt="What's the current URL?", + mcp_config={"hud": {...}}, + setup_tool={"name": "navigate", "arguments": {"url": "https://google.com"}}, + evaluate_tool={"name": "check_url", "arguments": {"expected": "google.com"}} +) + +# AFTER +from hud import Environment + +env = Environment("browser").connect_hub("hud-evals/browser") + +@env.scenario("navigate-google") +async def navigate_google(): + # ===== SETUP SECTION (replaces setup_tool) ===== + await env.call_tool("navigate", url="https://google.com") + + # ===== PROMPT (first yield) ===== + answer = yield "What's the current URL?" 
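+    # The agent runs here, between the two yields; its final answer is returned above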
+ + # ===== EVALUATE SECTION (replaces evaluate_tool) ===== + result = await env.call_tool("check_url", expected="google.com") + + # ===== REWARD (second yield) ===== + yield 1.0 if result else 0.0 + +# Create task from scenario +task = env("navigate-google") +``` + +### Multiple setup_tool Calls + +If you have multiple setup tools, just call them in sequence: + +```python +# BEFORE +setup_tool=[ + {"name": "navigate", "arguments": {"url": "..."}}, + {"name": "login", "arguments": {"user": "..."}}, + {"name": "go_to_page", "arguments": {"page": "settings"}} +] + +# AFTER +@env.scenario("settings-test") +async def settings_test(): + # Multiple setup steps - just call them in order + await env.call_tool("navigate", url="...") + await env.call_tool("login", user="...") + await env.call_tool("go_to_page", page="settings") + + answer = yield "Verify the settings page loaded correctly" + + result = await env.call_tool("check_settings") + yield 1.0 if result else 0.0 +``` + +## Using with Built-in Agents + +Built-in agents (ClaudeAgent, OpenAIAgent, etc.) work with both patterns: + +```python +from hud.agents import ClaudeAgent + +agent = ClaudeAgent.create() + +# Works with Task from scenario +result = await agent.run(env("navigate-google")) + +# Works with Task.from_v4() conversion +result = await agent.run(Task.from_v4(legacy_task)) +``` + +## Optional: Bring Your Own Agent + +v5 gives you the `hud.eval()` context manager for maximum flexibility: + +```python +async with hud.eval(env("checkout", product="laptop")) as ctx: + # Use OpenAI, Anthropic, your own agentโ€”whatever you want + response = await client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": ctx.prompt}], + tools=ctx.as_openai_chat_tools() + ) + + # Handle tool calls, run your agent loop... + await ctx.submit(response.choices[0].message.content) + +print(ctx.reward) +``` + +The old `ClaudeAgent` and `OperatorAgent` still workโ€”even with the new `hud.eval()` system. But now you're not locked into a specific agent spec. Pair with the [Gateway](/quick-links/gateway) to use any model through one API. + +## Quick Reference + +| v4 (deprecated in v0.6.0) | v5 | +|---------------------------|-----| +| `LegacyTask(...)` | `Task.from_v4(...)` (quick) or `env("scenario", ...)` (recommended) | +| `setup_tool` | Code before first yield in `@env.scenario()` | +| `evaluate_tool` | Code after first yield in `@env.scenario()` | +| `MCPServer` | `Environment` (drop-in replacement) | +| `agent.run(task)` | Still works, or use `hud.eval()` for BYOA | diff --git a/docs/quick-links/ab-testing.mdx b/docs/quick-links/ab-testing.mdx new file mode 100644 index 00000000..0c1215f8 --- /dev/null +++ b/docs/quick-links/ab-testing.mdx @@ -0,0 +1,61 @@ +--- +title: "A/B Evals" +description: "Find out which model actually performs best for your use case." +icon: "flask-vial" +--- + +LLM outputs vary from run to runโ€”ask the same question twice and you might get different quality answers. To find out which model actually performs best, you need to test each one multiple times and look at the spread. **Variants** let you test different models side-by-side. **Groups** repeat each test so you see the full distribution, not just one lucky or unlucky result. 
+ +## Variants + +Pass the configurations you want to test: + +```python +import hud + +async with hud.eval(variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": "What is 2+2?"}] + ) + ctx.reward = 1.0 if "4" in response.choices[0].message.content else 0.0 + +for result in ctx.results: + print(f"{result.variants}: reward={result.reward}") +``` + +## Groups + +Run each variant multiple times to get a distribution: + +```python +async with hud.eval( + variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, + group=5 # 10 runs total: 2 models ร— 5 each +) as ctx: + ... +``` + +The `hud.eval` manager will parallelize your evals automatically and show the distribution across all your runs on [hud.ai](https://hud.ai/home). + +## Remote Rollouts + +Once you've [deployed an environment](/quick-links/deploy#deploying-environments) and created evals, run them by name: + +```python +async with hud.eval("my-org/checkout-laptop", variants={"model": ["gpt-4o", "claude"]}) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": ctx.prompt}] + ) +``` + +The platform loads everythingโ€”environment, prompt, evaluation logic, comparisons across models. You just provide the agent. + +Or via CLI: + +```bash +hud eval my-org/checkout-laptop --model gpt-4o --group-size 5 +``` + +Or run directly on the platformโ€”see [Running at Scale](/quick-links/deploy#running-at-scale). diff --git a/docs/quick-links/deploy.mdx b/docs/quick-links/deploy.mdx new file mode 100644 index 00000000..5553c7d8 --- /dev/null +++ b/docs/quick-links/deploy.mdx @@ -0,0 +1,66 @@ +--- +title: "Deploy" +description: "Deploy environments. Create evals. Run and train at scale." +icon: "rocket" +--- + +You've built an environment with tools and scripts. Deploy it to the platform and you can run evals at scaleโ€”hundreds of parallel runs across models, all traced, all generating training data. + +## Deploying Environments + +Start with `hud init` ([see Environments](/quick-links/environments)) to scaffold locally. When ready: + +1. Go to [hud.ai](https://hud.ai) โ†’ **New** โ†’ **Environment** +2. Connect your GitHub repo and name your environment +3. Push changes and it rebuilds automatically, like Vercel + +Your environmentโ€”tools, scripts, everythingโ€”is now live. Connect from anywhere: + +```python +env.connect_hub("my-org/my-env") +``` + +## Running at Scale + +Once deployed, create evals on [hud.ai](https://hud.ai) from your scripts. Each eval is a frozen configurationโ€”same prompt, same scoring, every time. + +Your scenario might take arguments: + +```python +@env.scenario("checkout") +async def checkout_flow(product_name: str, apply_coupon: bool = False): + yield f"Complete checkout for {product_name}" + (" with coupon" if apply_coupon else "") + yield 1.0 if order_confirmed() else 0.0 +``` + +On the platform, click **New Eval** โ†’ select your scenario โ†’ fill in the arguments. Create multiple evals from the same scenario: + +| Eval Name | Arguments | +|-----------|-----------| +| `checkout-laptop` | `product_name="Laptop"`, `apply_coupon=False` | +| `checkout-phone-coupon` | `product_name="Phone"`, `apply_coupon=True` | +| `checkout-headphones` | `product_name="Headphones"`, `apply_coupon=False` | + +Then run themโ€”select an eval, choose variants and groups, launch hundreds of runs in parallel. Every run is traced. 
Results show scores, distributions, and side-by-side model comparisons. These become your training data. + +For A/B testing with variants and groups, see [A/B Evals](/quick-links/ab-testing). + +## What's Next? + +With your environment deployed: + +- **Scale**: Launch thousands of rollouts. Every run generates tracesโ€”prompts, tool calls, rewards. +- **Analyze**: See which evals agents struggle with. Compare models across your entire benchmark. +- **Train**: Use runs as training data. Fine-tune on successful completions. Run reinforcement learning to optimize for your specific environment. + +The loop: deploy โ†’ eval at scale โ†’ analyze โ†’ train โ†’ redeploy. Agents get better at *your* environment. + + + + Connect OpenAI, Anthropic, LangChain, and more. + + + + Turn production services into safe test environments. + + diff --git a/docs/quick-links/environments.mdx b/docs/quick-links/environments.mdx new file mode 100644 index 00000000..2827055a --- /dev/null +++ b/docs/quick-links/environments.mdx @@ -0,0 +1,108 @@ +--- +title: "Environments" +description: "Turn your code into agent-callable tools. Define how agents are evaluated." +icon: "cube" +--- + +An environment is everything an agent can interact withโ€”your APIs, services, databases, wrapped as tools. But it's more than that: the environment also defines how agents are *evaluated* through **scripts**. When you deploy an environment, you're creating a sandbox that agents can learn from at scale. + +## Why Environments, Not API Servers? + +Your production API is a single live instance with shared stateโ€”you can't run 500 tests against it in parallel without causing chaos. Environments spin up fresh for every evaluation: isolated, deterministic, reproducible. Run thousands in parallel, each starting from the exact state you define, each generating training data. An API server is a live system you observe. An environment is a sandbox you control. + +## Tools + +Start with `hud init` to scaffold an environmentโ€”works with existing codebases or from scratch: + +```bash +hud init +``` + +Every tool is just a function. Decorate it with `@env.tool()` and agents can call it: + +```python +from hud import Environment + +env = Environment("my-env") + +@env.tool() +async def search(query: str) -> str: + """Search the knowledge base.""" + return db.search(query) +``` + +Got a FastAPI app? One line: + +```python +env.connect_fastapi(app) +``` + +All your routes become tools. Run it: + +```python +async with env() as ctx: + tools = await ctx.list_tools() + result = await ctx.call_tool("search", query="test") +``` + +## Scripts + +To evaluate an agent, you need two things: what to tell it, and how to score what it did. Scripts capture both with two `yield` statements: + +```python +@env.scenario("checkout") +async def checkout_flow(product_name: str): + # Yield the prompt, receive the agent's final answer + answer = yield f"Add '{product_name}' to cart and complete checkout" + + # Score based on environment state and/or the answer + order_exists = await check_order_status(product_name) + yield 1.0 if order_exists else 0.0 +``` + +The agent runs between the yields. First yield sends the prompt and returns the agent's answer. Second yield checks environment stateโ€”database rows, files, API callsโ€”and returns a reward. Scripts live with the environment because only the environment knows how to verify what happened. 
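+
+As a further sketch (the `notes` dict and `save_note` tool below are hypothetical, not part of the SDK), a scenario can combine a state check with an answer check:
+
+```python
+notes: dict[str, str] = {}  # hypothetical in-memory store the tool writes to
+
+@env.tool()
+def save_note(title: str, body: str) -> str:
+    """Save a note so the scenario below can verify it."""
+    notes[title] = body
+    return f"Saved '{title}'"
+
+@env.scenario("write-note")
+async def write_note(topic: str):
+    # First yield: send the prompt, receive the agent's final answer
+    answer = yield f"Save a note titled '{topic}', then reply 'done'"
+
+    # Second yield: reward based on environment state AND the answer text
+    yield 1.0 if topic in notes and "done" in answer.lower() else 0.0
+```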
+ +## Evals + +Call the environment with a scenario name and arguments to create a task: + +```python +task = env("checkout", product_name="Laptop") + +async with hud.eval(task, group=4) as ctx: + # Connect your agent here. Handle tool calls, run agent loop... + response = await client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": ctx.prompt}], + tools=ctx.as_openai_chat_tools() + ) + + await ctx.submit(response.choices[0].message.content) + +print(ctx.reward) +``` + +This creates a trace on [hud.ai](https://hud.ai/home). Add [variants](/quick-links/ab-testing) to A/B test across models. To run evals at scale, [deploy your environment](/quick-links/deploy). + +## Mock Mode + +Testing your agent loop without hitting real services? Mock mode returns fake responses based on tool schemas: + +```python +env.mock() +env.mock_tool("search", "Mock search results") # Manual override of mock + +async with hud.eval(env(), group=4) as ctx: + tools = env.as_openai_chat_tools() + + response = await client.chat.completions.create( + model="claude-sonnet-4-5", + messages=[{"role": "user", "content": "Search for X"}], + tools=tools + ) + + # Returns mock value instead of hitting real service + result = await env.call_tool(response.choices[0].message.tool_calls[0]) +``` +Your agent code stays the sameโ€”just toggle `env.mock()` for local testing. + diff --git a/docs/quick-links/gateway.mdx b/docs/quick-links/gateway.mdx new file mode 100644 index 00000000..11d5d73d --- /dev/null +++ b/docs/quick-links/gateway.mdx @@ -0,0 +1,128 @@ +--- +title: "Gateway" +description: "One endpoint for every model. One API key. Full observability." +icon: "server" +--- + +Stop juggling API keys. HUD Gateway routes to Anthropic, OpenAI, Gemini, xAI, and more through a single OpenAI-compatible endpointโ€”with built-in telemetry. Swap `model="gpt-4o"` for `model="claude-sonnet-4-5"` and you're [A/B testing](/quick-links/ab-testing) across providers. Continuous RL from production coming soon. + +## Quick Start + +Point any OpenAI-compatible client at `inference.hud.ai`: + + + +```python Python +from openai import AsyncOpenAI +import os + +client = AsyncOpenAI( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +response = await client.chat.completions.create( + model="claude-sonnet-4-5", # or gpt-4o, gemini-2.5-pro, grok-4-1-fast... + messages=[{"role": "user", "content": "Hello!"}] +) +``` + +```bash curl +curl -X POST https://inference.hud.ai/chat/completions \ + -H "Authorization: Bearer $HUD_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "claude-sonnet-4-5", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + + + +## Supported Models + +Full list at [hud.ai/models](https://hud.ai/models). 
+ + +| Model | Routes | +|-------|--------| +| `claude-sonnet-4-5` | chat, messages | +| `claude-haiku-4-5` | chat, messages | +| `claude-opus-4-5` | chat, messages | +| `claude-opus-4-1` | chat, messages | + + + +| Model | Routes | +|-------|--------| +| `gpt-5.1` | chat, responses | +| `gpt-5-mini` | chat, responses | +| `gpt-4o` | chat, responses | +| `gpt-4o-mini` | chat, responses | +| `operator` | responses | + + + +| Model | Routes | +|-------|--------| +| `gemini-3-pro-preview` | chat | +| `gemini-2.5-pro` | chat | +| `gemini-2.5-computer-use-preview` | gemini | + + + +| Model | Routes | +|-------|--------| +| `grok-4-1-fast` | chat | +| `z-ai/glm-4.5v` | chat | + + +## Telemetry + +Wrap code in a plain `hud.eval()` to group inference calls. In the trace you'll see the full conversation in sequence, not scattered API calls. + +```python +async with hud.eval(): + response = await client.chat.completions.create( + model="claude-sonnet-4-5", + messages=[{"role": "user", "content": "Hello!"}] + ) +``` + +Or inject a trace ID manually if you're not using `hud.eval()`. Generate a UUID and pass it with each request in a task: + + + +```python Python +import uuid + +trace_id = str(uuid.uuid4()) # e.g. "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + +response = await client.chat.completions.create( + model="claude-sonnet-4-5", + messages=[{"role": "user", "content": "Hello!"}], + extra_headers={"Trace-Id": trace_id} +) +``` + +```bash curl +curl -X POST https://inference.hud.ai/chat/completions \ + -H "Authorization: Bearer $HUD_API_KEY" \ + -H "Content-Type: application/json" \ + -H "Trace-Id: a1b2c3d4-e5f6-7890-abcd-ef1234567890" \ + -d '{ + "model": "claude-sonnet-4-5", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + + + +View traces at [hud.ai/home](https://hud.ai/home). + +## Routes + +- **chat** โ€” `/chat/completions` (OpenAI-compatible) +- **messages** โ€” `/messages` (Anthropic-compatible) +- **responses** โ€” `/responses` (OpenAI Responses API) +- **gemini** โ€” Google Gemini native API diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index 650f200a..6e14401c 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -55,7 +55,19 @@ Get up and running with HUD in minutes. Follow these four steps to install the C -## Environments/CLI Quick Reference +## SDK Quick Reference + +```python +import hud + +# Run evaluation with the new eval API +async with hud.eval("hud-evals/SheetBench-50:0") as ctx: + agent = MyAgent() + result = await agent.run(ctx) + ctx.reward = result.reward +``` + +## CLI Quick Reference ```bash # Create sample environment diff --git a/docs/reference/agents.mdx b/docs/reference/agents.mdx index fa092a50..06316c24 100644 --- a/docs/reference/agents.mdx +++ b/docs/reference/agents.mdx @@ -97,7 +97,7 @@ Claude-specific implementation using Anthropic's API. 
```python from hud.agents import ClaudeAgent -from hud.datasets import Task +from hud.datasets import LegacyTask agent = ClaudeAgent.create( checkpoint_name="claude-sonnet-4-5", @@ -105,7 +105,7 @@ agent = ClaudeAgent.create( ) result = await agent.run( - Task( + LegacyTask( prompt="Navigate to example.com", mcp_config={ "hud": { @@ -245,12 +245,12 @@ agent = OpenAIChatAgent.create( ```python from hud.agents import ClaudeAgent -from hud.datasets import Task +from hud.datasets import LegacyTask agent = ClaudeAgent.create() result = await agent.run( - Task( + LegacyTask( prompt="Click the submit button", mcp_config={ "hud": { @@ -270,7 +270,7 @@ print(f"Reward: {result.reward}, Done: {result.done}") ### With Setup and Evaluation ```python -task = Task( +task = LegacyTask( prompt="Find the price of the product", mcp_config={ "hud": { diff --git a/docs/reference/cli/eval.mdx b/docs/reference/cli/eval.mdx index 0bfad028..e8be3c21 100644 --- a/docs/reference/cli/eval.mdx +++ b/docs/reference/cli/eval.mdx @@ -227,5 +227,5 @@ hud cancel --all - [Tasks Reference](/reference/tasks) - Task configuration - [Agents Reference](/reference/agents) - Agent options -- [`hud rl`](/reference/cli/rl) - RL training +- [`hud rft`](/reference/cli/rft) - Reinforcement fine-tuning - [`hud cancel`](/reference/cli/misc) - Cancel remote jobs diff --git a/docs/reference/cli/overview.mdx b/docs/reference/cli/overview.mdx index a474e3ef..49d226a1 100644 --- a/docs/reference/cli/overview.mdx +++ b/docs/reference/cli/overview.mdx @@ -21,8 +21,7 @@ The HUD CLI provides a complete toolkit for creating, developing, and running MC - `hud debug` โ€” 5โ€‘phase compliance test - `hud run` โ€” Execute (Python module/command/Docker) - `hud eval` โ€” Run agents on tasks/datasets - - `hud rl` โ€” Train with GRPO on tasks - - `hud rft` โ€” Fine-tune models with RL (BETA, invite-only) + - `hud rft` โ€” Fine-tune models (BETA, invite-only) @@ -62,8 +61,7 @@ hud --version | `hud debug` | Image/dir/config | 5โ€‘phase compliance test | `hud debug my-env:latest` | | `hud run` | Module/command/image | Execute server (local/remote) | `hud run controller --reload` | | `hud eval` | Tasks/dataset | Run agent on tasks | `hud eval tasks.json claude` | -| `hud rl` | Tasks/dataset | Train with GRPO | `hud rl tasks.json --local` | -| `hud rft` | Tasks file | Fine-tune with RL (BETA, invite-only) | `hud rft run tasks.json` | +| `hud rft` | Tasks file | Fine-tune models (BETA, invite-only) | `hud rft run tasks.json` | ### Other Commands | Command | Description | Example | diff --git a/docs/reference/cli/rft.mdx b/docs/reference/cli/rft.mdx index 8d1d3be1..771b806d 100644 --- a/docs/reference/cli/rft.mdx +++ b/docs/reference/cli/rft.mdx @@ -1,6 +1,6 @@ --- title: "hud rft" -description: "Reinforcement Fine-Tuning commands (invite-only)" +description: "Fine-Tuning commands (invite-only)" icon: "brain-circuit" --- @@ -12,7 +12,7 @@ RFT is currently in BETA. Features and APIs may change. **Access Required**: RFT is available by invite only. Contact [founders@hud.ai](mailto:founders@hud.ai) to request access. -The `hud rft` command group provides tools for fine-tuning models using reinforcement learning on HUD tasks. +The `hud rft` command group provides tools for fine-tuning models on HUD tasks. 
## Subcommands @@ -133,4 +133,3 @@ hud rft status f5f050a3-99c1-4339-b819-ccb1325f79d8 --verbose ## See Also - [Beta RFT Documentation](/beta/rft) - Detailed guide and examples -- [hud rl](/reference/cli/rl) - Standard reinforcement learning training diff --git a/docs/reference/cli/rl.mdx b/docs/reference/cli/rl.mdx deleted file mode 100644 index f644770b..00000000 --- a/docs/reference/cli/rl.mdx +++ /dev/null @@ -1,87 +0,0 @@ ---- -title: "hud rl" -description: "Run GRPO reinforcement learning on tasks" -icon: "brain" ---- - -The `hud rl` command trains an agent with GRPO on tasks, locally or via the HUD remote service. - -## Usage - -```bash -hud rl [TASKS_FILE|DATASET] [MODEL] [OPTIONS] -``` - -## Arguments - - - Path to tasks JSON/JSONL file or HuggingFace dataset name. If omitted, looks for a tasks file in the current directory. - - - - Model to train (default: interactive selection) - - -## Options - - - Path to existing configuration file. Short: `-c` - - - - Output directory for checkpoints. Short: `-o` - - - - Restart the vLLM server before training - - - - Enable verbose output. Short: `-v` - - - - Disable DistributedDataParallel (even with multiple GPUs) - - - - Specific GPUs for DDP (e.g., `0,1,2,3`) - - - - Specific GPU for vLLM server - - - - Run training locally instead of the remote HUD server - - -## Behavior - -- If no tasks file is provided, an interactive picker helps locate one. -- Remote mode (default) converts tasks to remote MCP automatically (build/push as needed) and launches remote training. -- Local mode runs training on your machine (delegated to `local_runner`). - -## Examples - -```bash -# Remote (default): auto-convert tasks to remote, then train -hud rl tasks.json --model claude-rl - -# Local training with GPU selection -hud rl tasks.json llama3.1 --local --ddp-gpus 0,1 --vllm-gpu 0 - -# Use a dataset directly (remote) -hud rl hud-evals/SheetBench-50 --model claude-rl -``` - -## See Also - -- [`hud eval`](/reference/cli/eval) -- [`hud get`](/reference/cli/get) -- [`hud build`](/reference/cli/build) -- [`hud push`](/reference/cli/push) - -## Pricing & Billing - -See hosted vLLM and training GPU rates in the [Training Quickstart โ†’ Pricing](/train-agents/quickstart#pricing). Manage usage and billing at `https://hud.ai/project/billing`. \ No newline at end of file diff --git a/docs/reference/environments.mdx b/docs/reference/environments.mdx index 889477f9..94942849 100644 --- a/docs/reference/environments.mdx +++ b/docs/reference/environments.mdx @@ -1,490 +1,302 @@ --- -title: "Environments" -description: "SDK reference for building MCP environments" -icon: "cube" +title: "Environment" +description: "SDK reference for the Environment class - tools, connectors, and integrations" +icon: "desktop" --- -The HUD SDK provides `MCPServer` for building MCP-compatible environments that work with any MCP client. +`Environment` is the unified class for defining tools, connecting to services, and formatting for any LLM provider. -## MCPServer +## Environment ```python -from hud.server import MCPServer +from hud import Environment + +env = Environment("my-env") ``` -Enhanced FastMCP server with Docker-friendly features for building HUD environments. 
+### Constructor -**Constructor Parameters:** | Parameter | Type | Description | Default | |-----------|------|-------------|---------| -| `name` | `str` | Server name for MCP handshake | Required | -| `instructions` | `str` | Server instructions/description | `None` | -| `**fastmcp_kwargs` | `Any` | Additional FastMCP parameters | - | +| `name` | `str` | Environment name | `"environment"` | +| `instructions` | `str \| None` | Description/instructions | `None` | +| `conflict_resolution` | `ConflictResolution` | How to handle tool name conflicts | `PREFIX` | + +### Context Manager + +Environment must be used as an async context manager to connect: -**Key Features:** -1. **SIGTERM handling** - Graceful shutdown in containers via custom runner -2. **Initialize decorator** - Async setup during MCP initialize request (stdout is temporarily redirected to stderr during initialization to avoid corrupting MCP output) -3. **Shutdown decorator** - Runs only on SIGTERM (container termination), not on hotโ€‘reload/SIGINT -4. **Enhanced add_tool()** - Automatically handles `BaseTool` instances and raw FastMCP Tool objects -5. **Tool decorator passthrough** - `@mcp.tool` returns the original function for easy composition -6. **FastMCP inheritance** - All FastMCP methods available (`mount`, `resource`, `tool`) +```python +async with env: + tools = env.as_openai_chat_tools() + result = await env.call_tool("my_tool", arg="value") +``` -### Decorators +## Defining Tools -#### @initialize +### @env.tool() -Run async setup during MCP initialize request: +Register functions as callable tools: ```python -mcp = MCPServer(name="my-env") +@env.tool() +def count_letter(text: str, letter: str) -> int: + """Count occurrences of a letter in text.""" + return text.lower().count(letter.lower()) + +@env.tool() +async def fetch_data(url: str) -> dict: + """Fetch JSON data from URL.""" + async with httpx.AsyncClient() as client: + response = await client.get(url) + return response.json() +``` + +Tools are automatically documented from type hints and docstrings. + +## Scripts + +Scripts define evaluation logic with two yields: -@mcp.initialize -async def setup_environment(ctx): - """ - Initialize environment resources. +```python +@env.scenario("checkout") +async def checkout_flow(product: str): + # First yield: send prompt, receive answer + answer = yield f"Add '{product}' to cart and checkout" - Args: - ctx: RequestContext with: - - ctx.meta: Client metadata dict - - ctx.session: MCP ServerSession - """ - # Access metadata from agent (if provided) - if ctx.meta: - progress_token = ctx.meta.get("progressToken") - display_width = ctx.meta.get("display_width", 1920) - display_height = ctx.meta.get("display_height", 1080) - - # Send progress notifications - if progress_token: - await ctx.session.send_progress_notification( - progress_token=progress_token, - progress=50, - total=100, - message="Initializing environment..." 
- ) + # Second yield: return reward based on result + order_exists = await check_order(product) + yield 1.0 if order_exists else 0.0 ``` -#### @shutdown - -Run cleanup on SIGTERM (container termination only): +Create Tasks from scripts: ```python -@mcp.shutdown -async def cleanup(): - """Clean up resources on shutdown.""" - if browser_provider: - browser_provider.close() - logger.info("Cleanup complete") +task = env("checkout", product="laptop") + +async with hud.eval(task) as ctx: + await agent.run(ctx.prompt) + await ctx.submit(agent.response) ``` -### Tool Registration +## Connectors + +Connect to external services as tool sources. + +### connect_hub() -Three ways to register tools: +Connect to a deployed HUD environment: ```python -# 1. Decorator for simple functions -@mcp.tool() -async def my_tool(param: str) -> dict: - return {"result": param} - -# 2. Add BaseTool instances -from hud.tools import BashTool -bash = BashTool() -mcp.add_tool(bash) # Automatically uses bash.mcp internally - -# 3. Add non-BaseTool instances directly -from custom import PlaywrightTool -playwright = PlaywrightTool() -mcp.add_tool(playwright) # Added as-is +env.connect_hub("my-org/browser", prefix="browser") +# Tools available as browser_navigate, browser_click, etc. ``` -### Hub Pattern (mount) +### connect_fastapi() -Use BaseHub for organized tool namespaces: +Import FastAPI routes as tools: ```python -from hud.tools import BaseHub +from fastapi import FastAPI -# Create hub -setup_hub = BaseHub("setup") +api = FastAPI() -# Add internal tools (hidden from agents) -@setup_hub.tool("board") -async def setup_board(size: int = 4): - game = setup_hub.env - game.reset(size=size) - return [TextContent(text=f"{size}x{size} board initialized")] +@api.get("/users/{user_id}", operation_id="get_user") +def get_user(user_id: int): + return {"id": user_id, "name": "Alice"} -# Mount hub on server -mcp.mount(setup_hub) - -# Agents call via dispatcher: setup(name="board", arguments={"size": 4}) +env.connect_fastapi(api) +# Tool available as get_user ``` -### Resources +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `app` | `FastAPI` | FastAPI application | Required | +| `name` | `str \| None` | Server name | `app.title` | +| `prefix` | `str \| None` | Tool name prefix | `None` | +| `include_hidden` | `bool` | Include routes with `include_in_schema=False` | `True` | + +### connect_openapi() -Expose metadata via MCP resources: +Import from OpenAPI spec: ```python -@mcp.resource("telemetry://live") -async def get_telemetry(): - """Expose live telemetry data.""" - return { - "provider": os.getenv("BROWSER_PROVIDER"), - "status": "running" if browser_provider else "stopped", - "live_url": browser_provider.get_live_view_url() if browser_provider else None, - "timestamp": datetime.now().isoformat() - } +env.connect_openapi("https://api.example.com/openapi.json") ``` -### Running the Server +### connect_server() + +Mount an MCPServer or FastMCP directly: ```python -if __name__ == "__main__": - # Run with SIGTERM handling (stdio by default) - mcp.run() +from fastmcp import FastMCP - # Or use development transports (HTTP/SSE) - mcp.run(transport="http", port=8765) - mcp.run(transport="sse", port=8080) -``` +tools = FastMCP("tools") -When using HTTP/SSE, HUD development helper endpoints are available: +@tools.tool +def greet(name: str) -> str: + return f"Hello, {name}!" 
-- `GET /hud` โ€“ overview -- `GET /hud/tools` โ€“ list tools with schemas -- `GET /hud/resources` โ€“ list resources -- `GET /hud/prompts` โ€“ list prompts +env.connect_server(tools) +``` -## Real Environment Examples +### connect_mcp_config() -### Minimal Environment +Connect via MCP config dict: ```python -# src/hud_controller/server.py -from hud.server import MCPServer -from mcp.types import TextContent - -mcp = MCPServer(name="counter-env") -counter = {"value": 0} - -@mcp.tool() -async def setup(start_value: int = 0): - """Initialize counter.""" - counter["value"] = start_value - return {"status": "ready", "counter": counter["value"]} - -@mcp.tool() -async def increment(): - """Increment counter.""" - counter["value"] += 1 - return [TextContent(text=f"Counter: {counter['value']}", type="text")] - -@mcp.tool() -async def evaluate(target: int): - """Check if target reached.""" - from hud.tools.types import EvaluationResult - return EvaluationResult( - reward=1.0 if counter["value"] >= target else 0.0, - done=counter["value"] >= target - ) - -if __name__ == "__main__": - mcp.run() +env.connect_mcp_config({ + "my-server": { + "command": "uvx", + "args": ["some-mcp-server"] + } +}) ``` -### text_2048 Environment +### connect_image() -From `environments/text_2048/src/hud_controller/server.py`: +Connect to a Docker image via stdio: ```python -from hud.server import MCPServer -from .game import Game2048 -from .tools import MoveTool -from .setup import setup as setup_hub -from .evaluate import evaluate as evaluate_hub - -mcp = MCPServer(name="text-2048") -game = None - -@mcp.initialize -async def initialize_environment(ctx): - global game - - # Progress notifications - progress_token = getattr(ctx.meta, "progressToken", None) if ctx.meta else None - - async def send_progress(progress: int, message: str): - if progress_token: - await ctx.session.send_progress_notification( - progress_token=progress_token, - progress=progress, - total=100, - message=message - ) - - await send_progress(0, "Starting 2048 game environment...") - - # Create game - game = Game2048() - game.reset() - - await send_progress(50, "Setting up game board...") - - # Set game on hubs - setup_hub.env = game - evaluate_hub.env = game - - # Mount hubs - mcp.mount(setup_hub) - mcp.mount(evaluate_hub) - - await send_progress(70, "Configuring tools...") - - # Add move tool - mcp.add_tool(MoveTool(env=game)) - - await send_progress(100, "2048 environment ready") +env.connect_image("mcp/fetch") ``` -### remote_browser Environment +## Tool Formatting -From `environments/remote_browser/src/hud_controller/server.py`: +Convert tools to provider-specific formats. 
+ +### OpenAI ```python -from hud.server import MCPServer -from hud.tools.computer import HudComputerTool, AnthropicComputerTool, OpenAIComputerTool -from .tools import PlaywrightToolWithMemory, BrowserExecutor -from .setup import setup as setup_hub -from .evaluate import evaluate as evaluate_hub -from .providers import get_provider - -mcp = MCPServer( - name="HUD Remote Browser Environment", - instructions="""Remote browser automation environment...""" +# Chat Completions API +tools = env.as_openai_chat_tools() +response = await client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=tools, ) -# Global state -browser_provider = None -playwright_tool = None - -@mcp.resource("telemetry://live") -async def get_telemetry_resource(): - """MCP resource with live browser status.""" - return { - "provider": os.getenv("BROWSER_PROVIDER", "unknown"), - "status": "running" if browser_provider else "stopped", - "live_url": browser_provider.get_live_view_url() if browser_provider else None, - "cdp_url": browser_provider.cdp_url if browser_provider else None - } +# Responses API +tools = env.as_openai_responses_tools() -@mcp.initialize -async def initialize_environment(ctx): - global browser_provider, playwright_tool - - # Get metadata - metadata = ctx.meta - progress_token = metadata.get("progressToken", None) - - # Initialize provider - provider_name = os.getenv("BROWSER_PROVIDER") - provider_class = get_provider(provider_name) - browser_provider = provider_class(config) - - # Launch browser - cdp_url = await browser_provider.launch() - - # Create playwright tool - playwright_tool = PlaywrightToolWithMemory(cdp_url=cdp_url) - await playwright_tool._ensure_browser() - - # Add playwright tool (not a BaseTool, added directly) - mcp.add_tool(playwright_tool) - - # Create computer tools - executor = BrowserExecutor(playwright_tool) - tool_kwargs = {"executor": executor} - - # Add display dimensions from metadata - if metadata: - width = metadata.get("display_width") - height = metadata.get("display_height") - if width and height: - tool_kwargs["width"] = width - tool_kwargs["height"] = height - - # Add computer tools (all are BaseTool subclasses) - mcp.add_tool(HudComputerTool(**tool_kwargs)) - mcp.add_tool(AnthropicComputerTool(**tool_kwargs)) - mcp.add_tool(OpenAIComputerTool(**tool_kwargs)) - - # Mount hubs - setup_hub.env = playwright_tool - evaluate_hub.env = playwright_tool - mcp.mount(setup_hub) - mcp.mount(evaluate_hub) - -@mcp.shutdown -async def shutdown_environment(): - """Cleanup browser resources.""" - global browser_provider - if browser_provider: - browser_provider.close() - browser_provider = None +# Agents SDK (requires openai-agents) +tools = env.as_openai_agent_tools() ``` -## Standard Structure +### Anthropic/Claude -### Directory Layout - -``` -my-environment/ -โ”œโ”€โ”€ Dockerfile -โ”œโ”€โ”€ pyproject.toml -โ”œโ”€โ”€ controller/ # MCP controller (stdio) -โ”‚ โ”œโ”€โ”€ __init__.py # mcp = MCPServer() -โ”‚ โ”œโ”€โ”€ __main__.py # python -m controller โ†’ mcp.run() -โ”‚ โ”œโ”€โ”€ hooks.py # @mcp.initialize / @mcp.shutdown -โ”‚ โ””โ”€โ”€ tools.py # @mcp.tool(...) 
-โ””โ”€โ”€ environment/ # Optional backend (HTTP/IPC) - โ””โ”€โ”€ server.py # e.g., FastAPI app +```python +tools = env.as_claude_tools() +response = await client.messages.create( + model="claude-sonnet-4-5", + messages=messages, + tools=tools, +) ``` -### Dockerfile +### Gemini -```dockerfile -FROM python:3.11-slim +```python +tools = env.as_gemini_tools() +config = env.as_gemini_tool_config() +``` -WORKDIR /app +### LangChain -# Copy and install -COPY pyproject.toml ./ -COPY controller/ ./controller/ -COPY environment/ ./environment/ -RUN pip install --no-cache-dir -e . +```python +# Requires langchain-core +tools = env.as_langchain_tools() +``` -ENV ENV_SERVER_PORT=8005 +### LlamaIndex -# Start optional backend, then MCP controller on stdio -CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --log-level warning & python -m controller"] +```python +# Requires llama-index-core +tools = env.as_llamaindex_tools() ``` -### Hub Module Pattern - -Example from text_2048: +### Google ADK ```python -# src/hud_controller/setup/__init__.py -from hud.tools.base import BaseHub +# Requires google-adk +tools = env.as_adk_tools() +``` -setup = BaseHub("setup") +## Calling Tools -# Import all setup functions to register them -from . import board +### call_tool() -__all__ = ["setup"] +Execute tools with auto-format detection: + +```python +# Simple call +result = await env.call_tool("my_tool", arg="value") -# src/hud_controller/setup/board.py -from . import setup +# From OpenAI tool call +result = await env.call_tool(response.choices[0].message.tool_calls[0]) -@setup.tool("board") -async def setup_board(board_size: int = 4): - """Initialize game board.""" - game = setup.env # Access environment from hub - game.reset(size=board_size) - return [TextContent(text=f"{board_size}x{board_size} game initialized")] +# From Claude tool use +result = await env.call_tool(response.content[0]) # tool_use block ``` -## Key Concepts +Returns result in matching format (OpenAI tool call โ†’ OpenAI tool message, etc.). -### Environment State +## Mock Mode -Three patterns for managing state: +Test without real connections: -1. **Global variables** (simple environments): - ```python - game = None - - @mcp.initialize - async def initialize_environment(ctx): - global game - game = Game2048() - ``` +```python +env.mock() # Enable mock mode -2. **Context class** (complex environments): - ```python - class EnvironmentContext: - def __init__(self): - self.browser = None - self.page = None - - env = EnvironmentContext() - ``` +# Set specific mock outputs +env.mock_tool("navigate", "Navigation successful") +env.mock_tool("screenshot", b"fake_image_data") -3. **Hub env attribute** (for tool access): - ```python - setup_hub.env = game # Tools access via hub.env - ``` +async with env: + result = await env.call_tool("navigate", url="https://example.com") + # Returns "Navigation successful" instead of actually navigating -### Tool Lifecycle +env.unmock() # Disable mock mode +``` -1. **Setup tools** - Hidden from agents, prepare environment state -2. **Interaction tools** - Available to agents for control -3. 
**Evaluate tools** - Hidden from agents, score performance +| Method | Description | +|--------|-------------| +| `mock(enable=True)` | Enable/disable mock mode | +| `unmock()` | Disable mock mode | +| `mock_tool(name, output)` | Set specific mock output | +| `is_mock` | Check if mock mode is enabled | -### Progress Notifications +## Properties -Send [progress updates](https://modelcontextprotocol.io/specification/basic/utilities/progress) during long-running operations: +| Property | Type | Description | +|----------|------|-------------| +| `name` | `str` | Environment name | +| `prompt` | `str \| None` | Default prompt (set by scenarios or agent code) | +| `is_connected` | `bool` | True if in context | +| `connections` | `dict[str, Connector]` | Active connections | -```python -async def send_progress(progress: int, message: str): - if progress_token: - await ctx.session.send_progress_notification( - progress_token=progress_token, - progress=progress, - total=100, - message=message - ) -``` +## Creating Tasks - -Progress notifications follow the [MCP progress specification](https://modelcontextprotocol.io/specification/basic/utilities/progress#progress-flow). The `progressToken` comes from the client's request [metadata](https://modelcontextprotocol.io/specification/basic/index#_meta). - - -### Metadata Access - -Agent metadata flows through initialization: +Call the environment to create a Task: ```python -@mcp.initialize -async def initialize_environment(ctx): - # From agent's metadata class variable - width = ctx.meta.get("display_width", 1920) if ctx.meta else 1920 - height = ctx.meta.get("display_height", 1080) if ctx.meta else 1080 -``` +# With scenario +task = env("checkout", product="laptop") -## Testing +# Without scenario (just the environment) +task = env() +``` -```bash -# CLI testing -hud debug my-env:latest -hud analyze my-env:latest +Then run with `hud.eval()`: -# Python testing -async def test(): - from hud.clients import MCPClient - - client = MCPClient({ - "env": {"command": "docker", "args": ["run", "-i", "my-env"]} - }) - - async with client: - tools = await client.list_tools() - result = await client.call_tool("setup", {"value": 0}) +```python +async with hud.eval(task, variants={"model": ["gpt-4o"]}) as ctx: + ... ``` ## See Also -- [Build Environments](/build-environments) - Getting started guide -- [Tools](/reference/tools) - Tool implementation reference -- [Environment Spec](/build-environments/spec) - Technical specification and architecture \ No newline at end of file +- [Evals](/reference/evals) - hud.eval() reference +- [MCPServer](/reference/mcpserver) - Building MCP servers +- [Environments Guide](/quick-links/environments) - Getting started guide + diff --git a/docs/reference/evals.mdx b/docs/reference/evals.mdx new file mode 100644 index 00000000..425e461e --- /dev/null +++ b/docs/reference/evals.mdx @@ -0,0 +1,208 @@ +--- +title: "Evals" +description: "SDK reference for hud.eval() - the unified evaluation context manager" +icon: "flask-vial" +--- + +`hud.eval()` is the primary way to run evaluations. It creates an `EvalContext` with telemetry, handles parallel execution, and integrates with the HUD platform. + +## hud.eval() + +```python +import hud + +async with hud.eval() as ctx: + # ctx is an EvalContext (extends Environment) + response = await client.chat.completions.create(...) 
+ ctx.reward = 1.0 +``` + +### Parameters + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `source` | `Task \| list[Task] \| str \| None` | Task objects from `env()`, task slugs, or None | `None` | +| `variants` | `dict[str, Any] \| None` | A/B test configuration (lists expand to combinations) | `None` | +| `group` | `int` | Runs per variant for statistical significance | `1` | +| `group_ids` | `list[str] \| None` | Custom group IDs for parallel runs | `None` | +| `job_id` | `str \| None` | Job ID to link traces to | `None` | +| `api_key` | `str \| None` | API key for backend calls | `None` | +| `max_concurrent` | `int \| None` | Maximum concurrent evaluations | `None` | +| `trace` | `bool` | Send telemetry to backend | `True` | +| `quiet` | `bool` | Suppress console output | `False` | + +### Source Types + +The `source` parameter accepts: + +```python +# 1. Blank eval - manual setup and reward +async with hud.eval() as ctx: + ctx.reward = compute_reward() + +# 2. Task from Environment (recommended) +env = Environment("my-env") +task = env("checkout", product="laptop") # Creates Task from scenario +async with hud.eval(task) as ctx: + await agent.run(ctx.prompt) + +# 3. Task slug (loads from platform) +async with hud.eval("my-org/browser-task") as ctx: + await agent.run(ctx) + +# 4. Multiple tasks +tasks = [env("checkout", product="laptop"), env("checkout", product="phone")] +async with hud.eval(tasks) as ctx: + await agent.run(ctx.prompt) +``` + +### Variants + +Test multiple configurations in parallel: + +```python +async with hud.eval( + eval, + variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, +) as ctx: + model = ctx.variants["model"] # Current variant + response = await client.chat.completions.create(model=model, ...) +``` + +Lists expand to all combinations: + +```python +variants = { + "model": ["gpt-4o", "claude"], + "temperature": [0.0, 0.7], +} +# Creates 4 combinations: gpt-4o+0.0, gpt-4o+0.7, claude+0.0, claude+0.7 +``` + +### Groups + +Run each variant multiple times for statistical significance: + +```python +async with hud.eval(eval, variants={"model": ["gpt-4o"]}, group=5) as ctx: + # Runs 5 times - see the distribution of results + ... +``` + +Total runs = `len(evals) ร— len(variant_combinations) ร— group` + +### Concurrency Control + +```python +async with hud.eval( + evals, + max_concurrent=10, # Max 10 parallel evaluations +) as ctx: + ... +``` + +## EvalContext + +`EvalContext` extends `Environment` with evaluation tracking. 
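+
+A quick sketch (assuming `task` came from `env(...)` and that a `search` tool is registered on that environment) showing how the context exposes both the environment's tools and the evaluation state:
+
+```python
+async with hud.eval(task) as ctx:
+    # Environment side: tool schemas and tool calls
+    tools = ctx.as_openai_chat_tools()
+    result = await ctx.call_tool("search", query="test")
+
+    # Evaluation side: telemetry and scoring
+    print(ctx.trace_id, ctx.variants)
+    ctx.reward = 1.0
+```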
+ +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `trace_id` | `str` | Unique trace identifier | +| `eval_name` | `str` | Evaluation name | +| `prompt` | `str \| None` | Task prompt (from scenario or task) | +| `variants` | `dict[str, Any]` | Current variant assignment | +| `reward` | `float \| None` | Evaluation reward (settable) | +| `answer` | `str \| None` | Submitted answer | +| `error` | `BaseException \| None` | Error if failed | +| `results` | `list[EvalContext]` | Results from parallel runs | +| `headers` | `dict[str, str]` | Trace headers for HTTP requests | +| `job_id` | `str \| None` | Parent job ID | +| `group_id` | `str \| None` | Group ID for parallel runs | +| `index` | `int` | Index in parallel execution | + +### Methods + +All `Environment` methods are available, plus: + +```python +# Submit answer (passes to scenario for evaluation) +await ctx.submit(answer) + +# Set reward directly +ctx.reward = 1.0 + +# Access tools in provider formats +tools = ctx.as_openai_chat_tools() + +# Call tools +result = await ctx.call_tool("my_tool", arg="value") +``` + +### Headers for Telemetry + +Inside an eval context, trace headers are automatically injected into HTTP requests: + +```python +async with hud.eval() as ctx: + # Requests to HUD services include Trace-Id automatically + response = await client.chat.completions.create(...) + + # Manual access + print(ctx.headers) # {"Trace-Id": "..."} +``` + +## Working with Environments + +The recommended pattern is to create Evals from an Environment: + +```python +from hud import Environment +import hud + +env = Environment("my-env") + +@env.tool() +def count_letter(text: str, letter: str) -> int: + return text.lower().count(letter.lower()) + +@env.scenario("count") +async def count_scenario(sentence: str, letter: str): + answer = yield f"How many '{letter}' in '{sentence}'?" + correct = str(sentence.lower().count(letter.lower())) + yield correct in answer + +# Create a Task from the scenario +task = env("count", sentence="Strawberry", letter="r") + +# Run with variants +async with hud.eval(task, variants={"model": ["gpt-4o", "claude"]}) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": ctx.prompt}], + tools=ctx.as_openai_chat_tools(), + ) + await ctx.submit(response.choices[0].message.content or "") +``` + +## Results + +After parallel runs complete, access results on the summary context: + +```python +async with hud.eval(eval, variants={"model": ["gpt-4o", "claude"]}, group=3) as ctx: + ... + +# ctx.results contains all individual EvalContexts +for result in ctx.results: + print(f"{result.variants}: reward={result.reward}, answer={result.answer}") +``` + +## See Also + +- [Environments](/reference/environments) - Environment class reference +- [A/B Evals](/quick-links/ab-testing) - Variants and groups guide +- [Deploy](/quick-links/deploy) - Running evals at scale +- [`hud eval` CLI](/reference/cli/eval) - Command-line interface + diff --git a/docs/reference/mcpserver.mdx b/docs/reference/mcpserver.mdx new file mode 100644 index 00000000..42d33e2b --- /dev/null +++ b/docs/reference/mcpserver.mdx @@ -0,0 +1,510 @@ +--- +title: "MCPServer" +description: "SDK reference for building MCP servers" +icon: "server" +--- + +`MCPServer` is the base class for building MCP-compatible servers that work with any MCP client. It extends FastMCP with Docker-friendly features. + +## Why MCP? 
+ +Traditional agent frameworks couple agents tightly to specific environments. MCP decouples them: + + + + - Agent code hardcoded for each environment + - No standardization across tools + - Difficult to swap agents or environments + + + + - Any agent works with any environment + - Standard protocol for all interactions + - Easy to swap components + + + +MCP standardizes agent-environment communication through JSON-RPC messages. Agents call tools exposed by servers and receive structured responses. + +## MCPServer + +```python +from hud.server import MCPServer +``` + +Enhanced FastMCP server with Docker-friendly features for building HUD environments. + +**Constructor Parameters:** +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `name` | `str` | Server name for MCP handshake | Required | +| `instructions` | `str` | Server instructions/description | `None` | +| `**fastmcp_kwargs` | `Any` | Additional FastMCP parameters | - | + +**Key Features:** +1. **SIGTERM handling** - Graceful shutdown in containers via custom runner +2. **Initialize decorator** - Async setup during MCP initialize request (stdout is temporarily redirected to stderr during initialization to avoid corrupting MCP output) +3. **Shutdown decorator** - Runs only on SIGTERM (container termination), not on hotโ€‘reload/SIGINT +4. **Enhanced add_tool()** - Automatically handles `BaseTool` instances and raw FastMCP Tool objects +5. **Tool decorator passthrough** - `@mcp.tool` returns the original function for easy composition +6. **FastMCP inheritance** - All FastMCP methods available (`mount`, `resource`, `tool`) + +### Decorators + +#### @initialize + +Run async setup during MCP initialize request: + +```python +mcp = MCPServer(name="my-env") + +@mcp.initialize +async def setup_environment(ctx): + """ + Initialize environment resources. + + Args: + ctx: RequestContext with: + - ctx.meta: Client metadata dict + - ctx.session: MCP ServerSession + """ + # Access metadata from agent (if provided) + if ctx.meta: + progress_token = ctx.meta.get("progressToken") + display_width = ctx.meta.get("display_width", 1920) + display_height = ctx.meta.get("display_height", 1080) + + # Send progress notifications + if progress_token: + await ctx.session.send_progress_notification( + progress_token=progress_token, + progress=50, + total=100, + message="Initializing environment..." + ) +``` + +#### @shutdown + +Run cleanup on SIGTERM (container termination only): + +```python +@mcp.shutdown +async def cleanup(): + """Clean up resources on shutdown.""" + if browser_provider: + browser_provider.close() + logger.info("Cleanup complete") +``` + +### Tool Registration + +Three ways to register tools: + +```python +# 1. Decorator for simple functions +@mcp.tool() +async def my_tool(param: str) -> dict: + return {"result": param} + +# 2. Add BaseTool instances +from hud.tools import BashTool +bash = BashTool() +mcp.add_tool(bash) # Automatically uses bash.mcp internally + +# 3. 
Add non-BaseTool instances directly +from custom import PlaywrightTool +playwright = PlaywrightTool() +mcp.add_tool(playwright) # Added as-is +``` + +### Hub Pattern (mount) + +Use BaseHub for organized tool namespaces: + +```python +from hud.tools import BaseHub + +# Create hub +setup_hub = BaseHub("setup") + +# Add internal tools (hidden from agents) +@setup_hub.tool("board") +async def setup_board(size: int = 4): + game = setup_hub.env + game.reset(size=size) + return [TextContent(text=f"{size}x{size} board initialized")] + +# Mount hub on server +mcp.mount(setup_hub) + +# Agents call via dispatcher: setup(name="board", arguments={"size": 4}) +``` + +### Resources + +Expose metadata via MCP resources: + +```python +@mcp.resource("telemetry://live") +async def get_telemetry(): + """Expose live telemetry data.""" + return { + "provider": os.getenv("BROWSER_PROVIDER"), + "status": "running" if browser_provider else "stopped", + "live_url": browser_provider.get_live_view_url() if browser_provider else None, + "timestamp": datetime.now().isoformat() + } +``` + +### Running the Server + +```python +if __name__ == "__main__": + # Run with SIGTERM handling (stdio by default) + mcp.run() + + # Or use development transports (HTTP/SSE) + mcp.run(transport="http", port=8765) + mcp.run(transport="sse", port=8080) +``` + +When using HTTP/SSE, HUD development helper endpoints are available: + +- `GET /hud` โ€“ overview +- `GET /hud/tools` โ€“ list tools with schemas +- `GET /hud/resources` โ€“ list resources +- `GET /hud/prompts` โ€“ list prompts + +## Real Environment Examples + +### Minimal Environment + +```python +# src/hud_controller/server.py +from hud.server import MCPServer +from mcp.types import TextContent + +mcp = MCPServer(name="counter-env") +counter = {"value": 0} + +@mcp.tool() +async def setup(start_value: int = 0): + """Initialize counter.""" + counter["value"] = start_value + return {"status": "ready", "counter": counter["value"]} + +@mcp.tool() +async def increment(): + """Increment counter.""" + counter["value"] += 1 + return [TextContent(text=f"Counter: {counter['value']}", type="text")] + +@mcp.tool() +async def evaluate(target: int): + """Check if target reached.""" + from hud.tools.types import EvaluationResult + return EvaluationResult( + reward=1.0 if counter["value"] >= target else 0.0, + done=counter["value"] >= target + ) + +if __name__ == "__main__": + mcp.run() +``` + +### text_2048 Environment + +From `environments/text_2048/src/hud_controller/server.py`: + +```python +from hud.server import MCPServer +from .game import Game2048 +from .tools import MoveTool +from .setup import setup as setup_hub +from .evaluate import evaluate as evaluate_hub + +mcp = MCPServer(name="text-2048") +game = None + +@mcp.initialize +async def initialize_environment(ctx): + global game + + # Progress notifications + progress_token = getattr(ctx.meta, "progressToken", None) if ctx.meta else None + + async def send_progress(progress: int, message: str): + if progress_token: + await ctx.session.send_progress_notification( + progress_token=progress_token, + progress=progress, + total=100, + message=message + ) + + await send_progress(0, "Starting 2048 game environment...") + + # Create game + game = Game2048() + game.reset() + + await send_progress(50, "Setting up game board...") + + # Set game on hubs + setup_hub.env = game + evaluate_hub.env = game + + # Mount hubs + mcp.mount(setup_hub) + mcp.mount(evaluate_hub) + + await send_progress(70, "Configuring tools...") + + # Add move tool + 
mcp.add_tool(MoveTool(env=game)) + + await send_progress(100, "2048 environment ready") +``` + +### remote_browser Environment + +From `environments/remote_browser/src/hud_controller/server.py`: + +```python +from hud.server import MCPServer +from hud.tools.computer import HudComputerTool, AnthropicComputerTool, OpenAIComputerTool +from .tools import PlaywrightToolWithMemory, BrowserExecutor +from .setup import setup as setup_hub +from .evaluate import evaluate as evaluate_hub +from .providers import get_provider + +mcp = MCPServer( + name="HUD Remote Browser Environment", + instructions="""Remote browser automation environment...""" +) + +# Global state +browser_provider = None +playwright_tool = None + +@mcp.resource("telemetry://live") +async def get_telemetry_resource(): + """MCP resource with live browser status.""" + return { + "provider": os.getenv("BROWSER_PROVIDER", "unknown"), + "status": "running" if browser_provider else "stopped", + "live_url": browser_provider.get_live_view_url() if browser_provider else None, + "cdp_url": browser_provider.cdp_url if browser_provider else None + } + +@mcp.initialize +async def initialize_environment(ctx): + global browser_provider, playwright_tool + + # Get metadata + metadata = ctx.meta + progress_token = metadata.get("progressToken", None) + + # Initialize provider + provider_name = os.getenv("BROWSER_PROVIDER") + provider_class = get_provider(provider_name) + browser_provider = provider_class(config) + + # Launch browser + cdp_url = await browser_provider.launch() + + # Create playwright tool + playwright_tool = PlaywrightToolWithMemory(cdp_url=cdp_url) + await playwright_tool._ensure_browser() + + # Add playwright tool (not a BaseTool, added directly) + mcp.add_tool(playwright_tool) + + # Create computer tools + executor = BrowserExecutor(playwright_tool) + tool_kwargs = {"executor": executor} + + # Add display dimensions from metadata + if metadata: + width = metadata.get("display_width") + height = metadata.get("display_height") + if width and height: + tool_kwargs["width"] = width + tool_kwargs["height"] = height + + # Add computer tools (all are BaseTool subclasses) + mcp.add_tool(HudComputerTool(**tool_kwargs)) + mcp.add_tool(AnthropicComputerTool(**tool_kwargs)) + mcp.add_tool(OpenAIComputerTool(**tool_kwargs)) + + # Mount hubs + setup_hub.env = playwright_tool + evaluate_hub.env = playwright_tool + mcp.mount(setup_hub) + mcp.mount(evaluate_hub) + +@mcp.shutdown +async def shutdown_environment(): + """Cleanup browser resources.""" + global browser_provider + if browser_provider: + browser_provider.close() + browser_provider = None +``` + +## Standard Structure + +### Directory Layout + +``` +my-environment/ +โ”œโ”€โ”€ Dockerfile +โ”œโ”€โ”€ pyproject.toml +โ”œโ”€โ”€ controller/ # MCP controller (stdio) +โ”‚ โ”œโ”€โ”€ __init__.py # mcp = MCPServer() +โ”‚ โ”œโ”€โ”€ __main__.py # python -m controller โ†’ mcp.run() +โ”‚ โ”œโ”€โ”€ hooks.py # @mcp.initialize / @mcp.shutdown +โ”‚ โ””โ”€โ”€ tools.py # @mcp.tool(...) +โ””โ”€โ”€ environment/ # Optional backend (HTTP/IPC) + โ””โ”€โ”€ server.py # e.g., FastAPI app +``` + +### Dockerfile + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# Copy and install +COPY pyproject.toml ./ +COPY controller/ ./controller/ +COPY environment/ ./environment/ +RUN pip install --no-cache-dir -e . 
+ +ENV ENV_SERVER_PORT=8005 + +# Start optional backend, then MCP controller on stdio +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --log-level warning & python -m controller"] +``` + +### Hub Module Pattern + +Example from text_2048: + +```python +# src/hud_controller/setup/__init__.py +from hud.tools.base import BaseHub + +setup = BaseHub("setup") + +# Import all setup functions to register them +from . import board + +__all__ = ["setup"] + +# src/hud_controller/setup/board.py +from . import setup + +@setup.tool("board") +async def setup_board(board_size: int = 4): + """Initialize game board.""" + game = setup.env # Access environment from hub + game.reset(size=board_size) + return [TextContent(text=f"{board_size}x{board_size} game initialized")] +``` + +## Key Concepts + +### Environment State + +Three patterns for managing state: + +1. **Global variables** (simple environments): + ```python + game = None + + @mcp.initialize + async def initialize_environment(ctx): + global game + game = Game2048() + ``` + +2. **Context class** (complex environments): + ```python + class EnvironmentContext: + def __init__(self): + self.browser = None + self.page = None + + env = EnvironmentContext() + ``` + +3. **Hub env attribute** (for tool access): + ```python + setup_hub.env = game # Tools access via hub.env + ``` + +### Tool Lifecycle + +1. **Setup tools** - Hidden from agents, prepare environment state +2. **Interaction tools** - Available to agents for control +3. **Evaluate tools** - Hidden from agents, score performance + +### Progress Notifications + +Send [progress updates](https://modelcontextprotocol.io/specification/basic/utilities/progress) during long-running operations: + +```python +async def send_progress(progress: int, message: str): + if progress_token: + await ctx.session.send_progress_notification( + progress_token=progress_token, + progress=progress, + total=100, + message=message + ) +``` + + +Progress notifications follow the [MCP progress specification](https://modelcontextprotocol.io/specification/basic/utilities/progress#progress-flow). The `progressToken` comes from the client's request [metadata](https://modelcontextprotocol.io/specification/basic/index#_meta). 
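+
+Under the hood, each `send_progress_notification` call is delivered as an MCP `notifications/progress` message. Roughly, the JSON-RPC payload looks like this (the token and values are illustrative):
+
+```json
+{
+  "jsonrpc": "2.0",
+  "method": "notifications/progress",
+  "params": {
+    "progressToken": "init-1",
+    "progress": 50,
+    "total": 100,
+    "message": "Initializing environment..."
+  }
+}
+```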
+ + +### Metadata Access + +Agent metadata flows through initialization: + +```python +@mcp.initialize +async def initialize_environment(ctx): + # From agent's metadata class variable + width = ctx.meta.get("display_width", 1920) if ctx.meta else 1920 + height = ctx.meta.get("display_height", 1080) if ctx.meta else 1080 +``` + +## Testing + +```bash +# CLI testing +hud debug my-env:latest +hud analyze my-env:latest + +# Python testing +async def test(): + from hud.clients import MCPClient + + client = MCPClient({ + "env": {"command": "docker", "args": ["run", "-i", "my-env"]} + }) + + async with client: + tools = await client.list_tools() + result = await client.call_tool("setup", {"value": 0}) +``` + +## See Also + +- [Environments](/reference/environments) - Environment class (client-side) +- [Tools](/reference/tools) - Tool implementation reference +- [Evals](/reference/evals) - Running evaluations \ No newline at end of file diff --git a/docs/reference/tasks.mdx b/docs/reference/tasks.mdx index 0bd6d76a..44f93138 100644 --- a/docs/reference/tasks.mdx +++ b/docs/reference/tasks.mdx @@ -4,12 +4,16 @@ description: "SDK reference for task configuration and dataset utilities" icon: "list-check" --- -The HUD SDK provides the `Task` class for defining agent objectives and dataset utilities for managing task collections. +The HUD SDK provides the `LegacyTask` class for defining agent objectives and dataset utilities for managing task collections. -## Task Class + +`LegacyTask` is deprecated. For new code, use `env("scenario_name", **args)` to create Task objects. See [Environments](/reference/environments) for the recommended approach. + + +## LegacyTask Class ```python -from hud.datasets import Task +from hud.datasets import LegacyTask ``` Pydantic model that defines an agent's objective, setup, and evaluation criteria. @@ -31,7 +35,7 @@ Pydantic model that defines an agent's objective, setup, and evaluation criteria The `mcp_config` field automatically resolves environment variables using `${VAR_NAME}` syntax: ```python -task = Task( +task = LegacyTask( prompt="Navigate to the dashboard", mcp_config={ "browser": { @@ -45,7 +49,7 @@ task = Task( ) ``` -Variables are resolved when Task is created from a dict - this is why datasets should store raw dictionaries. +Variables are resolved when LegacyTask is created from a dict - this is why datasets should store raw dictionaries. ## Running Tasks @@ -208,7 +212,7 @@ The `agent_config` field on tasks supports: | `initial_screenshot` | `bool` | Take screenshot before first action | ```python -task = Task( +task = LegacyTask( prompt="Complete the form", mcp_config={...}, agent_config={ diff --git a/docs/reference/types.mdx b/docs/reference/types.mdx index 8361353a..57f8cdb5 100644 --- a/docs/reference/types.mdx +++ b/docs/reference/types.mdx @@ -6,133 +6,126 @@ icon: "code" Core types used throughout the HUD SDK. -## Trace +## Task -Returned by `agent.run()`. Contains the result of an agent execution. +Created by calling an Environment. Holds configuration for running an evaluation. 
```python -from hud.types import Trace +from hud import Environment + +env = Environment("my-env") +task = env("scenario_name", arg1="value") # Returns Task ``` | Field | Type | Description | |-------|------|-------------| -| `reward` | `float` | Evaluation score (0.0-1.0) | -| `done` | `bool` | Whether execution completed | -| `content` | `str \| None` | Final response content | -| `isError` | `bool` | Whether an error occurred | -| `info` | `dict[str, Any]` | Additional metadata | -| `task` | `Task \| None` | The executed task | -| `trace` | `list[TraceStep]` | Execution trace steps | -| `messages` | `list[Any]` | Final conversation state | +| `env` | `Environment \| dict \| None` | Source environment | +| `scenario` | `str \| None` | Scenario name to run | +| `args` | `dict[str, Any]` | Script arguments | +| `trace_id` | `str \| None` | Trace identifier | +| `job_id` | `str \| None` | Parent job ID | +| `group_id` | `str \| None` | Group ID for parallel runs | +| `index` | `int` | Index in parallel execution | +| `variants` | `dict[str, Any] \| None` | Variant assignment | -## AgentResponse +## EvalContext -Returned by agent `get_response()` methods. Represents a single model response. +Returned by `hud.eval()`. Extends Environment with evaluation tracking. ```python -from hud.types import AgentResponse +async with hud.eval(task) as ctx: + print(ctx.prompt) # Task prompt + print(ctx.variants) # Current variant + ctx.reward = 1.0 # Set reward ``` -| Field | Type | Description | -|-------|------|-------------| -| `tool_calls` | `list[MCPToolCall]` | Tools to execute | -| `done` | `bool` | Whether agent should stop | -| `content` | `str \| None` | Response text | -| `reasoning` | `str \| None` | Model reasoning/thinking | -| `info` | `dict[str, Any]` | Provider-specific metadata | -| `isError` | `bool` | Error flag | +| Property | Type | Description | +|----------|------|-------------| +| `trace_id` | `str` | Unique trace identifier | +| `eval_name` | `str` | Evaluation name | +| `prompt` | `str \| None` | Task prompt | +| `variants` | `dict[str, Any]` | Current variant assignment | +| `reward` | `float \| None` | Evaluation reward | +| `answer` | `str \| None` | Submitted answer | +| `error` | `BaseException \| None` | Error if failed | +| `results` | `list[EvalContext]` | Results from parallel runs | +| `headers` | `dict[str, str]` | Trace headers | ## MCPToolCall -Represents a tool call to be executed. +Represents a tool call to execute. ```python from hud.types import MCPToolCall + +call = MCPToolCall( + name="navigate", + arguments={"url": "https://example.com"} +) ``` | Field | Type | Description | |-------|------|-------------| -| `id` | `str` | Unique identifier (auto-generated if not provided) | +| `id` | `str` | Unique identifier (auto-generated) | | `name` | `str` | Tool name | | `arguments` | `dict[str, Any]` | Tool arguments | -**Example:** - -```python -tool_call = MCPToolCall( - name="playwright", - arguments={"action": "click", "selector": "#submit"} -) -``` - ## MCPToolResult Result from executing a tool call. 
```python from hud.types import MCPToolResult + +result = MCPToolResult( + content=[TextContent(text="Success", type="text")], + isError=False +) ``` | Field | Type | Description | |-------|------|-------------| | `content` | `list[ContentBlock]` | Result content blocks | -| `structuredContent` | `dict[str, Any] \| None` | Structured result data | -| `isError` | `bool` | Whether the tool call failed | +| `structuredContent` | `dict \| None` | Structured result data | +| `isError` | `bool` | Whether the call failed | -## Task +## Trace -Defines an agent task with prompt, environment config, and lifecycle tools. +Returned by `agent.run()`. Contains the result of an agent execution. ```python -from hud.types import Task +from hud.types import Trace + +result = await agent.run(task, max_steps=20) +print(result.reward, result.done) ``` | Field | Type | Description | |-------|------|-------------| -| `prompt` | `str` | Instruction for the agent | -| `mcp_config` | `dict` | Environment connection config | -| `id` | `str \| None` | Unique identifier (required for datasets) | -| `system_prompt` | `str \| None` | Custom system prompt | -| `setup_tool` | `dict \| list[dict] \| None` | Tool(s) to initialize state | -| `evaluate_tool` | `dict \| list[dict] \| None` | Tool(s) to score performance | -| `agent_config` | `BaseAgentConfig \| None` | Task-specific agent config | -| `metadata` | `dict \| None` | Additional task metadata | - -**Example:** - -```python -task = Task( - prompt="Navigate to example.com and click login", - mcp_config={ - "hud": { - "url": "https://mcp.hud.ai/v3/mcp", - "headers": { - "Authorization": "Bearer ${HUD_API_KEY}", - "Mcp-Image": "hudpython/hud-remote-browser:latest" - } - } - }, - setup_tool={"name": "playwright", "arguments": {"action": "navigate", "url": "https://example.com"}}, - evaluate_tool={"name": "evaluate", "arguments": {"name": "url_contains", "substring": "/login"}} -) -``` +| `reward` | `float` | Evaluation score (0.0-1.0) | +| `done` | `bool` | Whether execution completed | +| `content` | `str \| None` | Final response content | +| `isError` | `bool` | Whether an error occurred | +| `info` | `dict[str, Any]` | Additional metadata | +| `trace` | `list[TraceStep]` | Execution trace steps | +| `messages` | `list[Any]` | Final conversation state | -## BaseAgentConfig +## AgentResponse -Standard agent configuration that tasks can override. +Returned by agent `get_response()` methods. ```python -from hud.types import BaseAgentConfig +from hud.types import AgentResponse ``` -| Field | Type | Description | Default | -|-------|------|-------------|---------| -| `allowed_tools` | `list[str] \| None` | Tool patterns to expose | `None` (all) | -| `disallowed_tools` | `list[str] \| None` | Tool patterns to hide | `None` | -| `system_prompt` | `str \| None` | Custom system prompt | `None` | -| `append_setup_output` | `bool` | Include setup output in first turn | `True` | -| `initial_screenshot` | `bool` | Include screenshot in initial context | `True` | -| `response_tool_name` | `str \| None` | Lifecycle tool for responses | `None` | +| Field | Type | Description | +|-------|------|-------------| +| `tool_calls` | `list[MCPToolCall]` | Tools to execute | +| `done` | `bool` | Whether agent should stop | +| `content` | `str \| None` | Response text | +| `reasoning` | `str \| None` | Model reasoning/thinking | +| `info` | `dict[str, Any]` | Provider-specific metadata | +| `isError` | `bool` | Error flag | ## AgentType @@ -140,6 +133,9 @@ Enum of supported agent types. 
```python from hud.types import AgentType + +agent_cls = AgentType.CLAUDE.cls +agent = agent_cls.create() ``` | Value | Agent Class | @@ -150,25 +146,44 @@ from hud.types import AgentType | `AgentType.GEMINI` | `GeminiAgent` | | `AgentType.OPENAI_COMPATIBLE` | `OpenAIChatAgent` | -**Example:** +## ContentBlock + +MCP content types (from `mcp.types`): ```python -from hud.types import AgentType +from mcp.types import TextContent, ImageContent -agent_cls = AgentType.CLAUDE.cls # Returns ClaudeAgent class -agent = agent_cls.create() +# Text +TextContent(text="Hello", type="text") + +# Image +ImageContent(data="base64...", mimeType="image/png", type="image") ``` -## ContentBlock +## EvaluationResult -MCP content block types (from `mcp.types`): +Returned by evaluation tools. -- `TextContent` - Text content with `text` field -- `ImageContent` - Image with `data` (base64) and `mimeType` -- `EmbeddedResource` - Embedded resource reference +```python +from hud.tools.types import EvaluationResult -## See Also +result = EvaluationResult( + reward=0.8, + done=True, + content="Task completed", + info={"score": 80} +) +``` -- [Agents Reference](/reference/agents) - Agent classes and configuration -- [Tasks Reference](/reference/tasks) - Task configuration details +| Field | Type | Description | +|-------|------|-------------| +| `reward` | `float` | Score (0.0-1.0) | +| `done` | `bool` | Task complete | +| `content` | `str \| None` | Details | +| `info` | `dict` | Metadata | + +## See Also +- [Evals](/reference/evals) - hud.eval() reference +- [Environments](/reference/environments) - Environment class +- [Agents](/reference/agents) - Agent classes diff --git a/docs/train-agents/quickstart.mdx b/docs/train-agents/quickstart.mdx deleted file mode 100644 index 32e83471..00000000 --- a/docs/train-agents/quickstart.mdx +++ /dev/null @@ -1,126 +0,0 @@ ---- -title: "RL Quickstart" -icon: "graduation-cap" ---- - -## Prerequisites - -- HUD API key: Remote training requires authentication. Set `HUD_API_KEY` before running: - -```bash -export HUD_API_KEY="sk-hud-..." # get one at https://hud.ai -# Or persist it locally: -hud set HUD_API_KEY=sk-hud-... -``` - -- Docker daemon: For local runs (using `--local`) or when training against a local Docker image, ensure Docker Desktop is installed and the Docker daemon is running. - -## Quickstart - -Install and download a taskset: - -```bash -uv tool install hud-python@latest --python 3.12 -hud get hud-evals/2048-basic -``` - -### 1) Simple: Train (remote by default) - -```bash -hud rl 2048-basic.json -``` - -This launches training remotely and automatically provisions a vLLM server and a trainer for you. You can monitor progress on https://hud.ai. The server persists between runs, so you can rerun training or evaluate against the same endpoint. - -Optional baseline first (Claude or Operator): - -```bash -hud eval 2048-basic.json -``` - -### 2) Run on your own machine/remote - -Use any provider with at least 2 GPUs (one for inference, one for training). Run locally with the flag `--local`: - -```bash -uv tool install hud-python@latest --python 3.12 -hud get hud-evals/2048-basic -hud rl 2048-basic.json --local -``` - -### Recommended setups - -- 2ร— A100: quick iteration, shorter runs -- 8ร— A100: higher throughput for larger tasksets - -Training throughput depends on task complexity and parallelism (`max_parallel_episodes`). 
-
-### 3) Build your own environment (hud init)
-
-Create a new MCP environment, develop with hot-reload, and train on a production image:
-
-```bash
-hud init my-env && cd my-env
-hud dev --interactive
-# When ready to run:
-hud rl
-```
-
-Change the tasks.json to include other tasks you want to train on.
-
-See [hud init](/reference/cli/init) for options and details.
-
-
-## Getting the best performance
-
-Training a good model often requires many iterations over the trainer's parameters. Take the config generated by `hud rl` and vary its values to run a hyperparameter sweep.
-
-For easy launching, specify the tasks and config upfront, and add `--yes` to automatically launch vLLM and training.
-
-```bash
-hud rl taskset.json --config rl-config.json --yes
-```
-
-Additionally, it can be helpful to run an initial analysis on the dataset to determine which tasks would be the most informative to train on. In that case, either start with a deployed model or run `hud rl` without training, and then:
-
-```bash
-hud eval taskset.json --full --group-size 6 --max-steps 5
-```
-
-This will prompt you for the model choice and produce a table of accuracies per task. Prefer tasks which are 10%-60% accurate for training.
-
-Some general findings from our internal training runs:
-- As many different tasks per gradient update as possible (runs with 4+ GPUs and batch size of 50+ are much more stable than single GPU runs)
-- Batch size should be somewhere around 2/X, where X is the accuracy of that given task on an untrained model.
-
-### Pricing
-
-Below is the pricing by GPU type. Actual prices vary — see https://hud.ai/project/billing for current rates.
-
-vLLM GPU Pricing (2 Hosted GPUs)
-
-| GPU type | Memory | Est. price/hr |
-| --- | --- | --- |
-| A100 80GB | 80 GB | $4.95 |
-| H100 80GB | 80 GB | $7.95 |
-
-Training GPU Pricing
-
-| GPU type | Memory | Est. 
price/hr | -| --- | --- | --- | -| A100 80GB | 80 GB | $3.95 | -| H100 80GB | 80 GB | $5.40 | - ---- - -### Learn more - - - - Complete guide to building environments from scratch - - - - Full `hud rl` command options and usage - - \ No newline at end of file diff --git a/docs/train-agents/tasks.mdx b/docs/train-agents/tasks.mdx deleted file mode 100644 index 58131b6f..00000000 --- a/docs/train-agents/tasks.mdx +++ /dev/null @@ -1,80 +0,0 @@ ---- -title: Dataset Design -icon: table ---- - -## Tasks format - -HUD tasksets can be provided in two primary formats (both supported): - -1) A single JSON file containing a list of task objects (recommended) - -```json -[ - { - "id": "browser_2048_128", - "prompt": "Reach 128 in 2048.", - "mcp_config": { - "hud": { - "url": "https://mcp.hud.ai/v3/mcp", - "headers": { - "Authorization": "Bearer ${HUD_API_KEY}", - "Mcp-Image": "hudevals/hud-browser:0.1.3" - } - } - }, - "setup_tool": {"name": "launch_app", "arguments": {"app_name": "2048"}}, - "evaluate_tool": {"name": "evaluate", "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}}} - } -] -``` - -Save as `2048-basic.json` and run: - -```bash -hud eval 2048-basic.json -hud rl 2048-basic.json -``` - -2) JSONL file with one task object per line - -- prompt: instruction for the agent -- mcp_config: where to run the environment (local docker or remote MCP) -- setup_tool (optional): a tool call to prepare the environment -- evaluate_tool: a tool call to compute reward -- system_prompt (optional): extra guidance for the agent - -## Hosting on HuggingFace - -You can host tasksets on the Hub and fetch them with: - -```bash -hud get hud-evals/2048-basic -``` - -The command downloads the JSONL task file and places it in your project directory. - -This allows running the full dataset or training with simply: - -```bash -hud eval hud-evals/2048-basic -hud rl hud-evals/2048-basic -``` - -## Tips - -- Keep tasks self-contained; use `setup_tool` to open apps or load data -- Ensure `evaluate_tool` returns a numeric reward per episode -- Use small task counts to iterate quickly; scale up once stable - - - - Learn how to run benchmarks - - - - Deep-dive into MCP configs and tools - - - - diff --git a/environments/README.md b/environments/README.md deleted file mode 100644 index 40cba300..00000000 --- a/environments/README.md +++ /dev/null @@ -1,956 +0,0 @@ -# How to Build HUD-Compatible MCP Environments - -This document is a step-by-step guide for turning *any* piece of software that can run in a Docker container into a **Model Context Protocol (MCP)** environment that the HUD SDK can evaluate or control. Weโ€™ll move through six short phases, each with a clear checkpoint. - -> **Big picture** -> โ€ข An *agent* (LLM) wants to solve tasks inside a *software environment*. -> โ€ข Your job: give that environment a clean, programmable surface โ€“ a set of -> *tools* the agent can invoke. -> โ€ข MCP is simply the wire-format we use to move those tool calls back and forth -> (like gRPC or HTTP but JSON-RPC over stdio/Docker). -> โ€ข FastMCP is the underlying SDK; HUD provides **MCPServer** โ€“ a thin wrapper that -> adds SIGTERM handling, `@initialize` / `@shutdown` decorators, and easier -> tool registration while remaining 100 % compatible with FastMCP. -> -> The picture: -> ```text -> LLM Agent โ”€โ”€JSON-RPCโ”€โ”€โ–บ FastMCP server (your code) โ”€โ”€โ–บ real app / game / browser -> ``` -> Your job is to wrap *any* app in an MCP server so agents can control it reproducibly & safely. 
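-
-For a concrete feel of that wire format: when an agent invokes a tool, the client sends a `tools/call` JSON-RPC request like the sketch below (the tool name and arguments are only illustrative):
-
-```json
-{
-  "jsonrpc": "2.0",
-  "id": 2,
-  "method": "tools/call",
-  "params": {
-    "name": "move",
-    "arguments": {"direction": "up"}
-  }
-}
-```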
- ---- - -## Phase Overview - -| Phase | Goal | -|-------|------| -| 1 | A Docker image that *starts* and prints to **stderr** | -| 2 | A minimal MCP server that responds to `initialize` over **stdio** | -| 3 | Working `setup`, `evaluate`, and **interaction** tools | -| 4 | Image launches remotely on the HUD platform & exposes live telemetry | -| 5 | Fast local iteration with `hud dev` hot-reload | - -Take the phases one at a time; do **not** jump ahead. Each stage's checkpoint is the foundation for the next. - -## Reference Implementations - -This repository includes two complete MCP environment implementations that demonstrate different levels of complexity: - -### 1. `text_2048` - Simple Game Environment -A minimalist ASCII-based 2048 game that showcases: -- Basic hub pattern with setup/evaluate tools -- Custom interaction tools (move command) -- Clean separation of game logic and MCP server -- Minimal dependencies (Python only) -- Perfect for learning the core concepts - -### 2. `remote_browser` - Advanced Browser Automation -A sophisticated browser automation environment featuring: -- Multiple cloud browser provider integrations (AnchorBrowser, Steel, BrowserBase, HyperBrowser, Kernel) -- Both Playwright and computer tools for interaction -- Extensive setup/evaluate capabilities (navigation, cookies, sheets, element checks) -- Live telemetry with browser viewing URLs -- Production-ready error handling and cleanup - -๐Ÿ’ก **Follow along with text_2048** as you work through each phase - it demonstrates all the core patterns with minimal complexity. - -### Installing the HUD CLI - -The HUD SDK includes a powerful CLI for debugging and analyzing MCP environments: - -```bash -# Install HUD CLI globally with uv (recommended) -uv tool install hud-python@latest --python 3.12 - -# Or use without installing -uvx --from hud-python hud --help - -# Verify installation -hud --help -``` - -Common commands: -```bash -# Debug your Docker image (runs 5-phase test) -hud debug my-mcp-server:latest - -# Analyze available tools and resources -hud analyze my-mcp-server:latest --format json - -# Debug any command-based MCP server -hud debug --command "python my_server.py" -``` -While you move through the phases it's handy to run the **interactive checker** to make sure nothing broke: - -```bash -# First build your Docker image -docker build -t my-environment environments/my-environment - -# Then debug it -hud debug my-environment -``` - -**What's the difference?** -- **`hud debug`** - Tests your environment in 5 phases, checking startup, MCP protocol, tools, and readiness. Use this first! -- **`hud analyze`** - Explores the environment to discover all tools, resources, and capabilities. Only works after debug passes phase 3. - -The script walks the *same* checklist and prints coloured, human-friendly hints whenever something fails. - -| What it validates | Phase | -|-------------------|-------| -| Container starts & logs to **stderr** | 1 | -| MCP server responds to an `initialize` request | 2 | -| Discovers `setup`, `evaluate`, and interaction tools | 3 | -| Calls `setup` / `evaluate`, checks telemetry & startup time | 4 | -| Spawns three concurrent clients to stress-test resources | 5 | - -๐Ÿ’ก **Run it after finishing each phase.** If the checker exits with a red โŒ, scroll up for the gold-coloured *hint* block โ€“ it usually points directly to the root cause. - ---- - -## Phase 1 โ€“ Write a Dockerfile - -**Goal โ†’** Create a container that can run your MCP server with proper Python packaging. 
- -Key principles: -- **stdout** is reserved for MCP protocol (JSON-RPC) -- **stderr** is for all logs and debug output -- Use proper Python packaging with `pyproject.toml` -- Run as a module for clean imports - -### Dockerfile Template - -```dockerfile -FROM python:3.11-slim - -# Prevent Python from buffering output (important for logs) -ENV PYTHONUNBUFFERED=1 \ - PYTHONDONTWRITEBYTECODE=1 - -WORKDIR /app - -# Copy package files -COPY pyproject.toml ./ -COPY src/ ./src/ - -# Install in editable mode for development flexibility -RUN pip install --no-cache-dir -e . - -# Run as a module to ensure proper package imports -CMD ["python", "-m", "my_module.server"] -``` - -### Build & Test - -```bash -docker build -t my-environment . - -# Test Phase 1: Container should start without errors -docker run --rm -i my-environment -``` - -### Recommended Environment Structure - -For Python-based MCP environments, use this standard structure: - -``` -my-environment/ -โ”œโ”€โ”€ Dockerfile -โ”œโ”€โ”€ README.md -โ”œโ”€โ”€ server/ # MCP server package -โ”‚ โ”œโ”€โ”€ pyproject.toml # MCP dependencies (hud-python, etc.) -โ”‚ โ”œโ”€โ”€ __init__.py # Empty package marker -โ”‚ โ”œโ”€โ”€ main.py # mcp = MCPServer() + lifecycle hooks -โ”‚ โ”œโ”€โ”€ tools.py # router = MCPRouter() + @router.tool decorators -โ”‚ โ”œโ”€โ”€ setup/ # Setup router (modular approach) -โ”‚ โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”‚ โ”œโ”€โ”€ basic.py # Basic setup functions -โ”‚ โ”‚ โ””โ”€โ”€ advanced.py # Advanced setup functions -โ”‚ โ””โ”€โ”€ evaluate/ # Evaluate router (modular approach) -โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”œโ”€โ”€ checks.py # Basic evaluation checks -โ”‚ โ””โ”€โ”€ metrics.py # Advanced metrics evaluators -โ””โ”€โ”€ environment/ # Backend service package - โ”œโ”€โ”€ pyproject.toml # Backend dependencies (fastapi, uvicorn) - โ”œโ”€โ”€ __init__.py - โ””โ”€โ”€ server.py # FastAPI app with /health, /act, /reset, /state -``` - -This structure enables: -- Clean separation of concerns (environment logic, tools, setup, evaluation) -- Easy volume mounting for development (Phase 5) -- Standard Python packaging with `pip install -e .` -- Modular organization - each setup/evaluator in its own file for clarity - -โ€ข **One Dockerfile only** โ€“ no docker-compose. -โ€ข If you're building a GUI environment, start from `hudpython/novnc-base:latest` instead and leave VNC configuration for later phases. - -Checkpoint reached? Congratulations โ€“ move on. - -๐Ÿ‘‰ Quick sanity check: `hud debug my-environment` (verifies Phase 1 automatically) - -Need inspiration? Check out our reference implementations: -โ€ข [`text_2048/Dockerfile`](./text_2048/Dockerfile) - Minimal Python setup, perfect for simple environments -โ€ข [`remote_browser/Dockerfile`](./remote_browser/Dockerfile) - Uses pre-built base image with browser dependencies -โ€ข [`browser/Dockerfile`](./browser/Dockerfile) - Multi-stage build with full GUI support - ---- - -## Phase 2 โ€“ Create the MCP Server - -**Goal โ†’** a Python process that: -1. Speaks MCP over **stdio**. -2. Responds correctly to the `initialize` request. -3. Logs everything to **stderr**. - -The MCP lifecycle is *initialize โ†’ operate โ†’ shutdown* (see spec link above). 
- -### Skeleton server (MCPServer) - -```python -import sys -import logging -from hud.server import MCPServer - -# 1๏ธโƒฃ Always log to stderr โ€“ stdout is reserved for JSON-RPC -logging.basicConfig( - stream=sys.stderr, - level=logging.INFO, - format='[%(levelname)s] %(asctime)s | %(name)s | %(message)s' -) - -# Create the server early so decorators can reference it -mcp = MCPServer(name="My Environment") - -# Run heavy one-time setup during MCP initialize -@mcp.initialize -async def initialize_environment(session=None, progress_token=None): - """Heavy one-time setup โ€“ start databases, launch background apps, etc.""" - logging.info("starting core servicesโ€ฆ") - await start_services() # your coroutine - logging.info("services ready") - -if __name__ == "__main__": - mcp.run() -``` - -*(Replace `start_services()` with whatever takes noticeable startup time โ€“ browsers, DBs, X servers, โ€ฆ)* - -### Adapt Dockerfile - -At the end of your Dockerfile, you must launch the MCP server as the container's main process, ensuring it communicates over stdio (stdin/stdout). This is typically done by setting the `CMD` or `ENTRYPOINT` to run your server module directly, for example: - - -```dockerfile -FROM python:3.11-slim - -WORKDIR /app -COPY . . - -# Optional: install requirements -# RUN pip install -r requirements.txt - -CMD ["python", "-m", "your_module_name"] # Replace 'your_module_name' with your actual entrypoint module -``` - -### Three validation steps (run them **in order**) - -| # | What you do | Why it matters | -|---|-------------|----------------| -| 1 | **Direct stdio test** โ€“ pipe the JSON below into your script | Proves the Python code handles `initialize` without any client or Docker noise | -| 2 | **MCP Inspector** โ€“ `npx @modelcontextprotocol/inspector python -m my_package.server` | Lets you click around: view capabilities, tools, resources | -| 3 | **Inside Docker** โ€“ rebuild the image and run it | This is *exactly* how HUD will execute the server | -| 4 | **Run `hud debug`** โ€“ `hud debug my-environment` | Combines the above checks & points out common mistakes | - -#### JSON for step 1 - -```jsonc -{ "jsonrpc": "2.0", "id": 1, "method": "initialize", "params": { - "protocolVersion": "2024-11-05", - "capabilities": {"roots": {"listChanged": true}}, - "clientInfo": {"name": "DevClient", "title": "Dev", "version": "0.0.0"} -}} -``` - -Pipe it: - -```bash -echo '' | python -m my_package.server -``` - -If all three validations succeed, you have a real MCP server โ€“ time to make it useful. - ---- - -## Phase 3 โ€“ Add Setup / Evaluate / Interaction Tools - -**Goal โ†’** tools are discoverable in the Inspector *and* callable from the HUD SDK. - -๐Ÿ‘‰ After wiring in the tools, confirm with `hud debug my-environment --max-phase 3` โ€“ it now checks for their presence and basic execution. - -๐Ÿ” Once debug passes phase 3, you can analyze the environment: -```bash -hud analyze my-environment # Interactive view of tools and resources -hud analyze my-environment --format json # JSON output for scripts -hud analyze my-environment --format markdown # Generate documentation -``` - -1. Write **`setup`** and **`evaluate`** tools first โ€“ they are *lifecycle* tools and never shown to the LLM. -2. Register at least one **interaction** tool (`computer`, `playwright`, or your own). 
- -### Approach 1: Simple Direct Implementation - -For simple environments with just a few setup/evaluate functions, you can use direct tool decorators with **MCPServer**: - -```python -from hud.server import MCPServer -from hud.tools import HudComputerTool - -mcp = MCPServer(name="my-environment") - -@mcp.tool() -async def setup(config: dict) -> dict: - ... # prepare environment - -@mcp.tool() -async def evaluate(config: dict) -> dict: - ... # return {"reward": <0-1>, "done": bool} - -@mcp.initialize -async def initialize_environment(session=None, progress_token=None): - custom_tool = HudComputerTool() - mcp.add_tool(custom_tool.mcp) - - # Any other initialization -``` - -### Approach 2: Hub Pattern (Recommended for Complex Environments) - -The BaseHub pattern provides a clean way to organize multiple setup/evaluate functions with automatic discovery and registration. **A BaseHub is fundamentally another MCP server (it's a subclass of FastMCP)** that you mount to your main server, providing namespace separation and modular organization. All hub functions are exposed through one tool named after the hub, and a resource that can list all of its tools. - -When mounted, the hub's tools are accessible through a single tool that dispatches to the appropriate function: -```json -{ - "name": "setup", - "arguments": { - "name": "reset", // Which function in the hub to call - "arguments": {"param": "value"} // Additional parameters - } -} -``` - -```python -# In setup/__init__.py -from hud.tools.base import BaseHub - -# Create the setup hub (a sub-server) -setup = BaseHub("setup") - -# Import all setup modules to register their tools -from . import basic, advanced # This registers all @setup.tool() decorated functions - -# In setup/basic.py -from . import setup -from mcp.types import TextContent - -@setup.tool() -async def reset(**kwargs): - """Reset the environment to its initial state. - - Args: - **kwargs: Additional parameters - - Returns: - TextContent - """ - # Access environment from the hub - env = setup.env - await env.reset_state() - return TextContent( - text="Environment reset to initial state", - type="text" - ) - -@setup.tool() -async def seed_data(num_items: int = 5): - """Seed the environment with test data. - - Args: - num_items: Number of items to create - - Returns: - TextContent - """ - # Access environment from the hub - env = setup.env - items = await env.create_items(num_items) - return TextContent( - text=f"Created {len(items)} items", - type="text" - ) - -# In evaluate/__init__.py -from hud.tools.base import BaseHub - -# Create the evaluate hub (another sub-server) -evaluate = BaseHub("evaluate") - -# Import all evaluator modules -from . import checks, metrics - -# In evaluate/checks.py -from . import evaluate -from hud.tools.types import EvaluationResult - -@evaluate.tool() -async def task_complete(expected_count: int): - """Check if the expected number of tasks are completed. 
- - Args: - expected_count: Expected number of completed tasks - - Returns: - EvaluationResult - """ - # Access environment from the hub - env = evaluate.env - completed = await env.count_completed() - return EvaluationResult( - reward=min(completed / expected_count, 1.0), - done=completed >= expected_count, - content=f"Completed {completed}/{expected_count} tasks", - info={"completed": completed, "expected": expected_count} - ) - -# In server.py -from .setup import setup as setup_hub -from .evaluate import evaluate as evaluate_hub - -# Create MCP server -mcp = MCPServer(name="my-environment") - -@mcp.initialize -async def initialize_environment(ctx): - """Initialize the environment with progress notifications.""" - # Extract progress token from context - progress_token = getattr(ctx.meta, "progressToken", None) if ctx.meta else None - # Send progress updates if available - async def send_progress(progress: int, message: str): - if progress_token: - await ctx.session.send_progress_notification( - progress_token=progress_token, - progress=progress, - total=100, - message=message, - ) - - await send_progress(10, "Starting environment initialization...") - - # Initialize your environment state/context - env = await create_environment_context() - await send_progress(50, "Environment created...") - - # Set environment on hubs - setup_hub.env = env - evaluate_hub.env = env - - # Mount hubs to MCP server - mcp.mount(setup_hub) - mcp.mount(evaluate_hub) - await send_progress(80, "Tools registered...") - - # Register any custom interaction tools - if hasattr(env, 'custom_tool'): - mcp.add_tool(env.custom_tool.mcp) - - await send_progress(100, "Environment ready!") -``` - -The BaseHub pattern provides: -- **Namespace isolation**: Tools are grouped under the hub's name (e.g., "setup", "evaluate") -- **Modular organization**: Each hub can be developed and tested independently -- **Type safety**: Full type hints preserved for parameters and returns - -When you call a hub's tool, you specify which function to execute: -```python -# Calling the "reset" function in the setup hub -await client.call_tool("setup", {"name": "reset"}) - -# Calling the "task_complete" function in the evaluate hub -await client.call_tool("evaluate", {"name": "task_complete", "expected_count": 5}) -``` - -### Test workflow - -1. **Inspector first** โ€“ restart the server, refresh the *Tools* tab, confirm the new tools appear. -2. **Run `hud debug my-environment`** โ€“ this validates initialization, tool discovery and basic calls automatically. -3. **Rebuild the image** โ€“ `docker build -t my-environment .`. -4. **HUD SDK script test** โ€“ run a short script like the one below. GUI environments built from `hudpython/novnc-base` still expose a VNC viewer on โ€“ keep it open while testing. 
- -```python -import asyncio -import hud -from hud.datasets import Task -from hud.agents import ClaudeAgent -from hud.clients import MCPClient - -async def main(): - # `trace` captures *everything* that happens and sends it to hud.ai - async with hud.async_trace("local_test"): - task = Task( - prompt="Complete the task", - mcp_config={ - "local": { - "command": "docker", - "args": ["run", "--rm", "-i", "my-environment:latest"] - } - }, - setup_tool={"name": "setup", "arguments": {"name": "todo_seed", "num_items": 5}}, - evaluate_tool={"name": "evaluate", "arguments": {"name": "todo_completed", "expected_count": 2}} - ) - client = MCPClient(mcp_config=task.mcp_config) - - agent = ClaudeAgent( - mcp_client=client, - model="claude-3-7-sonnet-20250219", - allowed_tools=["computer"] # or ["move"] for text_2048 - ) - - result = await agent.run(task) - print(result) - - await client.close() - -asyncio.run(main()) -``` - -The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to hud.ai โ€“ perfect for debugging. - -See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for larger end-to-end demos. - ---- - -## Phase 4 โ€“ Remote Deployment & HUD Runner - -**Goal โ†’** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so the hud.ai can visualise the whole lifecycle. - -### 1. Publish your image - -Log in to Docker Hub (or any registry HUD can pull from) and push a tagged build: - -```bash -docker tag my-environment yourdockerhubuser/my-environment:latest -docker push yourdockerhubuser/my-environment:latest -``` - -*(If youโ€™re using a private registry, make sure the HUD worker has pull credentials.)* - -### 2. Launch it remotely (gmail_remote pattern) - -Here's how to configure a remote MCP server that runs **the same Docker image**: - -```python -from hud import settings -from hud.clients import MCPClient - -# Your image is in a registry, now tell HUD to pull & run it on demand -config = { - "hud": { - "url": settings.hud_mcp_url, - "headers": { - "Authorization": f"Bearer {settings.api_key}", - "Mcp-Image": "yourdockerhubuser/my-environment:latest", # which image to launch - }, - } -} - -client = MCPClient(mcp_config=config) -``` - -_Steps 3 and 4 below are **optional but highly recommended** once the image boots successfully._ - -Spin up **many** agents in parallel by just launching multiple tasks โ€“ HUD will queue and start as many containers as resources allow. - -### 3. Progress updates during `initialize` (Optional) - -At remote scale it can take 10-30 s for heavy services to boot. Use the new -`@mcp.initialize` decorator plus the `session` / `progress_token` parameters to -stream status messages: - -```python -@mcp.initialize -async def initialize_environment(session=None, progress_token=None): - async def send(p, msg): - if session and progress_token: - await session.send_progress_notification( - progress_token=progress_token, - progress=p, - total=100, - message=msg - ) - await send(10, "Starting X11...") - await start_x11() - await send(50, "Launching browserโ€ฆ") - await launch_browser() - await send(100, "ready") -``` - -Those messages are displayed live on hud.ai alongside resource graphs โ€“ perfect feedback while you wait. - -### 4. Live telemetry (`telemetry://live`) (Optional) - -Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return live url to be displayed on hud.ai. 
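-
-A minimal sketch of what that resource can look like (the field names are illustrative, and `browser_provider` stands for whatever object your environment already manages):
-
-```python
-@mcp.resource("telemetry://live")
-async def live_telemetry():
-    """Expose a live viewing URL so hud.ai can embed it next to the trace."""
-    return {
-        "status": "running" if browser_provider else "stopped",
-        "live_url": browser_provider.get_live_view_url() if browser_provider else None,
-    }
-```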
- -Once all of the above works you can unleash *hundreds* of concurrent agents on your new environment. - ---- - -## Phase 5 โ€“ Hot-Reload Development - -For rapid local development, run the controller and environment servers separately. This enables instant code updates without Docker rebuilds. - -### Development Setup - -You'll need **two terminal windows** for local development: - -#### Terminal 1: MCP Server -```bash -cd environments/my-environment/server -hud dev # Auto-detects and runs with hot-reload - -# Optional flags: -hud dev --inspector # Launch MCP Inspector -hud dev --interactive # Launch interactive testing mode -hud dev --stdio # Use stdio transport (default: HTTP) -hud dev --watch ../shared # Watch additional directories -``` - -The `hud dev` command: -- Auto-detects the MCP module in the current directory -- Watches for file changes and reloads automatically -- Runs on HTTP by default (http://localhost:8765/mcp) -- Can launch MCP Inspector for testing tools -- Can launch interactive mode for manual testing - -#### Terminal 2: Environment Server (Backend) -```bash -cd environments/my-environment/environment -uvicorn server:app --reload # Standard uvicorn with hot-reload -``` - -For the backend, we simply use `uvicorn` directly since it already provides excellent hot-reload capabilities. - -### Development Workflow - -1. Start both servers in separate terminals -2. Edit code in either `server/` or `environment/` - changes reload automatically -3. Test changes immediately without rebuilding Docker images -4. Use MCP Inspector or interactive mode to test tools -5. When ready, build the complete Docker image: `hud build` - -### Quick Cursor Setup - -Add to `.cursor/mcp.json` (or use the deeplink from `hud dev` output): - -```json -{ - "mcpServers": { - "my-environment-dev": { - "url": "http://localhost:8765/mcp" - } - } -} -``` - -**Note**: Make sure both MCP server and environment backend are running when using with Cursor or agents. - -### Process Separation for Stateful Environments - -**Important Architecture Pattern**: For environments that maintain state (browsers, databases, running applications), you should separate the MCP server process from the actual environment process. This separation is critical for effective hot-reload development. - -#### Why Process Separation? - -When `hud dev` restarts your MCP server for code changes, you don't want to lose: -- Open browser windows and navigation state -- Database connections and data -- Running application state -- X11/VNC sessions -- Any expensive initialization - -#### Architecture Pattern - -``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ MCP Server โ”‚โ”€โ”€โ”€โ”€โ–ถโ”‚ Environment Process โ”‚ -โ”‚ (Restartable) โ”‚ โ”‚ (Persistent) โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - โ–ฒ โ”‚ - โ”‚ โ”‚ - โ””โ”€โ”€โ”€ Communication โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - (Socket, API, gRPC) -``` - -#### Implementation Example - -1. 
**Create a Context Server** (`context_server.py`):
-```python
-import asyncio
-
-from hud.server.context import run_context_server
-
-class PersistentEnvironmentContext:
-    def __init__(self):
-        self.state = {}
-        self.resources = None
-
-    def startup(self):
-        # One-time expensive initialization
-        self.resources = initialize_expensive_resources()
-
-    def get_state(self):
-        return self.state
-
-    def get_resources(self):
-        return self.resources
-
-if __name__ == "__main__":
-    context = PersistentEnvironmentContext()
-    context.startup()
-    # Run on Unix socket
-    asyncio.run(run_context_server(context, "/tmp/my_env_ctx.sock"))
-```
-
-2. **Connect from MCP Server** (`server.py`):
-```python
-from hud.server.context import attach_context
-
-@mcp.initialize
-async def initialize_environment(ctx):
-    # Connect to persistent context
-    persistent_ctx = attach_context("/tmp/my_env_ctx.sock")
-
-    # Use existing state without reinitializing
-    state = persistent_ctx.get_state()
-    resources = persistent_ctx.get_resources()
-```
-
-3. **Update Dockerfile** to run both processes:
-```dockerfile
-# Start context server in background
-CMD ["sh", "-c", "python -m hud_controller.context_server & python -m hud_controller.server"]
-```
-
-#### Communication Options
-
-- **Unix Sockets** (recommended for local): Fast, simple, no network overhead
-- **TCP/HTTP API**: Good for distributed systems
-- **gRPC**: Type-safe, efficient for complex APIs
-- **Shared Memory**: Ultra-fast for large data
-
-See the `browser` environment for a complete production example of this pattern.
-
-### 4. Cursor rules – paste this once
-
-Inside `.cursor/rules/mcp_environment_iteration.mdc` add (or verify) the following so the agent always knows the expected iteration loop:
-
-```mdc
----
-description: Improve an MCP environment
-alwaysApply: false
----
-Setup
-1. Make sure the user has set up the mcp config for the environment by seeing if you have access to the tools by the given name (e.g. my-environment-dev), and make sure the title is in dev mode. If not, ask the user to make a dev version!
-2. Make sure you can find the source folder for this environment. Explore its contents and README.
-3. Clarify the objectives and ask follow up questions on the initial query to determine precise implementation details.
-
-Iteration
-1. Use the tools exposed by the environment to interact with it. This means navigating around with a computer, editing, launching commands, whatever means are accessible to you. If there are any exposed resources, try to access them to determine the structure of the calls.
-2. Based on the objectives, test and verify the functionality of different tools and parts of the environment. If any tool call responds with an error, note it down. If any interaction with the environment is wrong, unexpected, incomplete, or parts of the environment are not developed fully, note it down. If any new problem sets up wrong or evaluation does not match the expected outcome, note it down. All of these inconsistencies you should note down in your TODOs.
-3. Then, based on the TODOs, view the source folder and find the places where those errors would occur. Think about the system and how to fix it. Then fix it.
-4. After you've fixed your TODO items, go back to step 2 and test them. Test through all of your available tools, and use feedback (such as screenshots) to determine your progress. If they now work as expected, mark them as complete. If not, continue the loop from step 2. Be extremely careful, scrupulous and attentive to all details. 
Never assume something is working unless you've tested it fully for all of its edge cases.
-5. The only time you can exit this iteration loop is if there is no feasible way to create input conditions to test something. In this case, ask the user for help and recap your progress. If you're simply changing tools, changing code, and still have more realistic TODOs, the restart_server tool automatically refreshes the environment and you should continue working. In *all* other cases, you must continue this iteration loop until you can come up with no more TODOs. You must not halt.
-```
-
-### 5. Prompt the agent
-
-```txt
-Context: In the my-environment folder, I have a browser app environment. I've built a tool to interact with it called my-environment-dev.
-Interaction: There are multiple tools to set up and evaluate the environment. There are also interaction tools for you to be able to move around it, and a screenshot tool to see the state. Use all of the available tools.
-Objective: Please test whether all setup and evaluation functions are working. This means you should come up with new problem definitions to test all functionality on. Be creative in how you pick edge cases to test on.
-Rules: @mcp_environment_iteration.mdc
-```
-
----
-
-## Phase 6 – Optional Polish & Extensions
-
-### Deeper dive into registries
-
-An environment often needs *structured knowledge* about tasks, evaluation logic, or problem definitions. The browser examples keep these in three explicit registries:
-
-| Registry | Purpose | Example resource URI |
-|----------|---------|----------------------|
-| **Setup** | How to seed the environment before the agent starts | `setup://registry` & `setup://{env}` |
-| **Evaluators** | Functions that decide success & reward | `evaluators://registry` |
-| **Problems** | Bundled benchmarks / tasks with their own setup & evaluate pairs | `problems://registry` |
-
-Each registry is just a dictionary mapping a *name* to a *class*. Use a **decorator** to register classes:
-
-```python
-from .registry import setup, evaluator, problem
-
-@setup("todo_seed")
-class TodoSeed:
-    ...
-
-@evaluator("todo_completed")
-class TodoCompleted:
-    ...
-
-@problem("todo_basic", description="Complete two todo items", difficulty="easy")
-class TodoBasic:
-    def get_setup(self):
-        return {"name": "todo_seed", "arguments": {"num_items": 5}}
-    def get_evaluation(self):
-        return {"name": "todo_completed", "arguments": {"expected_count": 2}}
-```
-
-Decorators keep registration *next to the implementation* and avoid manual bookkeeping. The server simply exposes the combined metadata through an MCP **resource**. Follow `environments/browser/src/hud_controller/problems/registry.py` as a template and expose the JSON with `@mcp.resource("problems://registry")`.
-
-### Other finishing touches
-
-* **Performance** – lazy-load heavy resources, pool DB connections, cache expensive calls.
-* **Security** – sandbox untrusted code, keep secrets in env vars, audit-log every tool call.
-* **Creative ideas** – API simulators, network test-beds, game worlds… if it fits in Docker it can be an MCP environment.
-
----
-
-## Contributing to Existing Environments
-
-When improving existing environments, follow these guidelines:
-
-### 1. 
Understanding the Environment - -Before making changes: -- Read the environment's README and any documentation -- Run `hud debug ` to test the environment -- Run `hud analyze ` (after debug passes phase 3) to explore capabilities -- Explore the folder structure and identify key components -- Test existing setup/evaluate functions to understand behavior - -### 2. Making Improvements - -**Adding New Setup Functions** -```python -# In setup/my_new_setup.py -from . import setup -from hud.tools import BaseSetup, TextContent - -@setup("my_new_setup", description="Clear description of what this does") -class MyNewSetup(BaseSetup): - async def __call__(self, context, param1: str, param2: int = 10) -> TextContent: - # Implementation - return TextContent(...) -``` - -**Adding New Evaluators** -```python -# In evaluate/my_evaluator.py -from . import evaluator -from hud.tools import BaseEvaluator, EvaluationResult - -@evaluator("my_check", description="What this evaluates") -class MyCheckEvaluator(BaseEvaluator): - async def __call__(self, context, threshold: float) -> EvaluationResult: - score = await context.calculate_score() - return { - "reward": min(score / 100, 1.0), - "done": score >= threshold, - "info": {"score": score, "threshold": threshold} - } -``` - -### 3. Testing Your Changes - -**Use `hud dev` for Hot-Reload Development** -```bash -# Navigate to the environment directory -cd environments/my-environment - -# Start development server with hot-reload -hud dev --build - -# In another terminal, test your changes -hud analyze hud-my-environment:dev - -# Or use interactive mode to test tools directly -hud dev --build --interactive -``` - -The `hud dev` command automatically: -- Mounts your `src/` directory for live code updates -- Handles container lifecycle and restarts -- Provides an HTTP endpoint for testing -- Shows logs for debugging - -## Testing Your Environment - -Once your environment is working, create comprehensive tests to ensure it stays that way: - -### Creating Test Files - -Each environment should have a test file following this pattern: -- `environments//test__mcp.py` - -The test file should include: -1. **Docker Build Test**: Ensure the image builds successfully -2. **MCP Initialization Tests**: Verify phases 1-3 using `hud debug` -3. **Tool-Specific Tests**: Test your environment's unique tools -4. **Integration Tests**: Test complete workflows - -Example test structure: -```python -class TestMyEnvironment: - IMAGE_NAME = "my-environment-test:latest" - - @classmethod - def setup_class(cls): - """Build Docker image before tests""" - # Build the image - - def test_phase1_basic_startup(self): - """Test container starts""" - - @pytest.mark.asyncio - async def test_phase2_3_mcp_initialize_and_tools(self): - """Test MCP init and tool discovery""" - - @pytest.mark.asyncio - async def test_environment_specific_tools(self): - """Test your custom tools""" -``` - -### Running Tests - -You can run tests directly with pytest: - -```bash -# Run all tests for an environment -cd environments/text_2048 -pytest test_text_2048_mcp.py -v -``` - -### Test Dependencies - -Add pytest to your environment's `pyproject.toml`: - -```toml -[project.optional-dependencies] -test = ["pytest>=7.0", "pytest-asyncio>=0.20"] -``` - -## Summary - -1. Start with a *plain* Dockerfile โ€“ verify it runs. -2. Add a minimal FastMCP server โ€“ verify with stdio, Inspector, Docker. -3. Implement tools โ€“ verify discovery + execution. -4. Run the same image remotely โ€“ verify telemetry. -5. 
Automate the loop with cursor-mcp. -6. **Write comprehensive tests** โ€“ ensure reliability. -7. Polish and extend as inspiration strikes. - -Happy building โ€“ and remember: **stderr is your friend, stdout belongs to MCP.** ๐Ÿš€ diff --git a/environments/blank/.env.example b/environments/blank/.env.example deleted file mode 100644 index 86f9a702..00000000 --- a/environments/blank/.env.example +++ /dev/null @@ -1,7 +0,0 @@ -# HUD API Configuration -# Get your API key from https://hud.ai/account -HUD_API_KEY="" - -# Anthropic API Configuration (optional) -# Required for using Claude agents - get from https://console.anthropic.com/ -ANTHROPIC_API_KEY="" diff --git a/environments/blank/Dockerfile b/environments/blank/Dockerfile deleted file mode 100644 index fd2639bd..00000000 --- a/environments/blank/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -FROM public.ecr.aws/docker/library/python:3.11-bookworm - -WORKDIR /app - -RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* - -# Copy and install MCP server dependencies -COPY server/pyproject.toml ./server/ -RUN pip install --no-cache-dir ./server - -# Copy and install environment dependencies -COPY environment/pyproject.toml ./environment/ -RUN pip install --no-cache-dir ./environment - -# Copy source code after dependencies -COPY server/ ./server/ -COPY environment/ ./environment/ - -ENV ENV_SERVER_PORT=8005 - -# Start environment server in background, then run MCP server with hot-reload -CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --log-level warning --reload >&2 & sleep 0.5 && hud dev server.main --stdio"] diff --git a/environments/blank/README.md b/environments/blank/README.md deleted file mode 100644 index e62c47e4..00000000 --- a/environments/blank/README.md +++ /dev/null @@ -1,128 +0,0 @@ -# Blank Environment - -Minimal starter template for building HUD environments. -See [docs](https://docs.hud.ai/build-environments) for the complete environment design workflow. - -## Architecture - -**`environment/`** - Produces structured data - -- Owns all state (game logic, browser sessions, databases, etc.) -- Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state - -**`server/`** - Wraps data in MCP tools - -- Calls environment endpoints to get structured data for the agent, and environment setup/evaluation -- Agents and tasks interact only with these tools! - -**Why separate?** Edit tools for the agent or tasks without restarting the heavy environment backend. - -## Development - -```bash -# Terminal 1 - Environment backend -cd environment -uv run uvicorn server:app --reload - -# Terminal 2 - MCP server -cd server -uv run hud dev -``` - -Uncomment the `setup` tool in `server/tools.py`, save, and watch it reload. -Visit http://localhost:8765/docs to see the new tool appear instantly. - -In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent. - -For complex environments that require many dependencies, we recommend running `hud dev` in the environment root: - -```bash -cd .. 
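# assumption: running hud dev from the environment root lets it watch both
# server/ and environment/, not just the MCP layer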
-hud dev -``` - -## Tasks & Evaluation - -```bash -# Build first in the global folder with the Dockerfile (creates blank:0.1.0) -hud build -``` - -Your `tasks.json` uses `docker run` to launch the environment: - -```json -{ - "prompt": "Your task prompt", - "mcp_config": { - "local": { - "command": "docker", - "args": ["run", "--rm", "-i", "blank:0.1.0"] - } - } -} -``` - -**Commands:** - -```bash -# Build first -hud build - -# Test task locally -hud eval tasks.json - -# Push environment for remote running -hud push - -# Production RL training -hud rl tasks.json # Auto-converts dockerโ†’remote, builds & pushes if needed -``` - -## Publishing Your Environment - -Once your environment is ready, you can share it with the community: - -### 1. Push to Registry - -```bash -# Build and push your environment (requires docker hub login and hud api key) -hud build -hud push -``` - -### 2. Create a Dataset - -Create a dataset on HuggingFace with your tasks: - -**Option A: Upload manually** - -1. Upload your `tasks.json` to HuggingFace -2. Make sure it's **public** to appear on leaderboards - -**Option B: Use the SDK** - -```python -from hud.datasets import save_tasks -import json - -# Load your tasks -with open("tasks.json") as f: - tasks = json.load(f) - -# Push to HuggingFace -save_tasks(tasks, repo_id="your-org/your-dataset") -``` - -### 3. Run and Track Performance - -```bash -# Run Claude on your benchmark -hud eval "your-org/your-dataset" claude - -# View results at: -# hud.ai/leaderboards/your-org/your-dataset -``` - -**Note**: Only public HuggingFace datasets appear as leaderboards! - -๐Ÿ“š Learn more: [Creating Benchmarks](https://docs.hud.ai/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.ai/evaluate-agents/leaderboards) diff --git a/environments/blank/environment/README.md b/environments/blank/environment/README.md deleted file mode 100644 index b902ec25..00000000 --- a/environments/blank/environment/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Environment - -Backend service: owns state and exposes HTTP APIs the controller calls. - -Endpoints (FastAPI) -- `GET /health` โ†’ {status: ok} -- `POST /act` โ†’ increments counter and returns {count} -- `POST /reset` โ†’ resets counter -- `GET /state` โ†’ returns {count} - -Run (dev) -```bash -uv run uvicorn server:app --reload --port 8005 -``` - -Principle: treat like a backend. Keep longโ€‘lived state here; add endpoints as tools need them. 
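
Before wiring anything into MCP, it can help to smoke-test these endpoints directly over HTTP. The snippet below is a minimal sketch (the `BASE_URL` constant and `main()` script are illustrative, not part of the environment), assuming the backend is running locally on port 8005 as in the dev command above:

```python
# Minimal smoke test for the blank environment backend (sketch).
# Assumes the FastAPI app above is running locally, e.g.
#   uv run uvicorn server:app --reload --port 8005
import httpx

BASE_URL = "http://localhost:8005"  # illustrative; matches the ENV_SERVER_PORT default

def main() -> None:
    with httpx.Client(base_url=BASE_URL, timeout=10.0) as client:
        # Health check before anything else
        assert client.get("/health").json() == {"status": "ok"}

        # Reset the counter, take three actions, then read the state back
        client.post("/reset")
        for _ in range(3):
            client.post("/act")
        count = client.get("/state").json()["count"]
        print(f"count = {count}")  # expected: count = 3

if __name__ == "__main__":
    main()
```

If the counter behaves as expected here, the MCP tools in `server/tools.py` only need to forward requests to these same endpoints.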
diff --git a/environments/blank/environment/__init__.py b/environments/blank/environment/__init__.py deleted file mode 100644 index d9cd6199..00000000 --- a/environments/blank/environment/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Blank environment package.""" diff --git a/environments/blank/environment/pyproject.toml b/environments/blank/environment/pyproject.toml deleted file mode 100644 index 8256f97e..00000000 --- a/environments/blank/environment/pyproject.toml +++ /dev/null @@ -1,16 +0,0 @@ -[project] -name = "blank-environment" -version = "0.1.0" -description = "Backend service for blank environment" -requires-python = ">=3.11" -dependencies = [ - "fastapi", - "uvicorn[standard]", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["."] diff --git a/environments/blank/environment/server.py b/environments/blank/environment/server.py deleted file mode 100644 index 7a382599..00000000 --- a/environments/blank/environment/server.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Minimal FastAPI environment server (HTTP-based).""" - -from fastapi import FastAPI - -import logging -import sys - -logging.basicConfig( - stream=sys.stderr, - level=logging.INFO, - format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", -) - -app = FastAPI(title="Blank Environment API") - -_count = 0 - - -@app.get("/health") -def health(): - return {"status": "ok"} - - -@app.post("/act") -def act(): - global _count - _count += 1 - return {"count": _count} - - -@app.post("/reset") -def reset(): - global _count - _count = 0 - return {"ok": True} - - -@app.get("/state") -def state(): - return {"count": _count} diff --git a/environments/blank/server/README.md b/environments/blank/server/README.md deleted file mode 100644 index 19fc7068..00000000 --- a/environments/blank/server/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# MCP Server - -MCP layer that wraps environment data in tools for agent interaction. 
- -## Structure - -- `main.py` - Server initialization, imports routers -- `tools.py` - MCP tools that call environment HTTP endpoints - -## Development - -```bash -# Start MCP server with hot-reload -uv run hud dev -``` - -## Key Principles - -- Keep tools thin - call environment HTTP endpoints -- Use routers for organization -- All long-lived state lives in `environment/`, not here \ No newline at end of file diff --git a/environments/blank/server/__init__.py b/environments/blank/server/__init__.py deleted file mode 100644 index 219d9cdd..00000000 --- a/environments/blank/server/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""MCP server package.""" diff --git a/environments/blank/server/main.py b/environments/blank/server/main.py deleted file mode 100644 index bbe98d13..00000000 --- a/environments/blank/server/main.py +++ /dev/null @@ -1,43 +0,0 @@ -import sys -import logging -from hud.server import MCPServer -from server.shared import http_client - -# Configure logging to stderr -logging.basicConfig( - stream=sys.stderr, - level=logging.INFO, - format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", - force=True, -) -for logger_name in ["httpx", "httpcore"]: - logging.getLogger(logger_name).setLevel(logging.WARNING) - -# Create main MCP server -mcp = MCPServer(name="blank-environment") - -# Include routers -from server.tools import router as tools_router - -mcp.include_router(tools_router) - - -# Lifecycle hooks -@mcp.initialize -async def init(): - """Check if the environment is healthy""" - if http_client: - await http_client.get("/health") - else: - raise ValueError("http_client is not set") - - -@mcp.shutdown -async def cleanup(): - """Close the HTTP client""" - if http_client: - await http_client.aclose() - - -if __name__ == "__main__": - mcp.run(transport="stdio") diff --git a/environments/blank/server/pyproject.toml b/environments/blank/server/pyproject.toml deleted file mode 100644 index 403f92c0..00000000 --- a/environments/blank/server/pyproject.toml +++ /dev/null @@ -1,19 +0,0 @@ -[project] -name = "blank-server" -version = "0.1.0" -description = "MCP server for blank environment" -requires-python = ">=3.11" -dependencies = [ - "hud-python>=0.4.54", - "httpx>=0.28.1", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.metadata] -allow-direct-references = true - -[tool.hatch.build.targets.wheel] -packages = ["."] diff --git a/environments/blank/server/shared.py b/environments/blank/server/shared.py deleted file mode 100644 index ad81fac5..00000000 --- a/environments/blank/server/shared.py +++ /dev/null @@ -1,15 +0,0 @@ -from __future__ import annotations - -import os -import httpx - -# Environment port (as string to simplify formatting) -ENV_SERVER_PORT = os.getenv("ENV_SERVER_PORT", "8005") - -# Shared HTTP client for talking to the environment backend -http_client = httpx.AsyncClient( - base_url=f"http://localhost:{ENV_SERVER_PORT}", - timeout=10.0, -) - -__all__ = ["ENV_SERVER_PORT", "http_client"] diff --git a/environments/blank/server/tools.py b/environments/blank/server/tools.py deleted file mode 100644 index 32f3c414..00000000 --- a/environments/blank/server/tools.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Tools router for environment interaction.""" - -from hud.server import MCPRouter -from hud.tools.types import EvaluationResult -from server.shared import http_client - -router = MCPRouter() - - -@router.tool -async def act() -> str: - """Perform one action step in the environment (increment the counter).""" - resp = await 
http_client.post("/act") - data = resp.json() - return f"Action #{data.get('count', 0)} performed. Current count: {data.get('count', 0)}" - - -@router.tool -async def setup() -> str: - """Initialize or reset the environment to its starting state.""" - await http_client.post("/reset") - return "Counter reset to 0" - - -@router.tool -async def evaluate(target: int = 10) -> EvaluationResult: - """Evaluate progress toward the target count and return a reward and done flag.""" - resp = await http_client.get("/state") - current_count = resp.json().get("count", 0) - delta = target - current_count - reward = max(0.0, 1.0 - abs(delta) / target) if target > 0 else current_count - done = current_count >= target - return EvaluationResult( - reward=reward, done=done, content=f"Counter at {current_count}/{target}" - ) diff --git a/environments/blank/tasks.json b/environments/blank/tasks.json deleted file mode 100644 index f24e7b63..00000000 --- a/environments/blank/tasks.json +++ /dev/null @@ -1,44 +0,0 @@ -[ - { - "prompt": "Increment the counter to reach 3", - "mcp_config": { - "local": { - "command": "docker", - "args": [ - "run", - "--rm", - "-i", - "blank:latest" - ] - } - }, - "agent_config": { - "allowed_tools": ["act"], - "append_setup_output": true - }, - "setup_tool": { - "name": "setup", - "arguments": {} - }, - "integration_test_tool": [ - { - "name": "act", - "arguments": {} - }, - { - "name": "act", - "arguments": {} - }, - { - "name": "act", - "arguments": {} - } - ], - "evaluate_tool": { - "name": "evaluate", - "arguments": { - "target": 3 - } - } - } -] diff --git a/environments/blank/test_task.py b/environments/blank/test_task.py deleted file mode 100644 index 0f46690a..00000000 --- a/environments/blank/test_task.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python -""" -Simple example of running tasks from tasks.json. Make sure to have run hud build. -""" - -from __future__ import annotations - -import asyncio -import json - -from hud.clients import MCPClient -from hud.datasets import Task - - -async def run_task(task_data: dict): - task = Task(**task_data) - client = MCPClient(mcp_config=task.mcp_config) - - try: - print("Initializing client...") - await client.initialize() - - result = await client.call_tool(task.setup_tool) # type: ignore - print(f"โœ… Setup: {result.content}") - - print("\n๐Ÿ”„ Performing actions:") - for _ in range(10): - result = await client.call_tool(name="act", arguments={}) - print(f" {result.content}") - - result = await client.call_tool(task.evaluate_tool) # type: ignore - print(f"\n๐Ÿ“Š Evaluation: {result.content}") - - return result.content - except Exception as e: - if "connection" in str(e).lower(): - print( - "โŒ Could not connect. Make sure 'hud dev --build' is running in another terminal." 
- ) - else: - raise e - finally: - await client.shutdown() - - -async def main(): - for task_data in json.load(open("tasks.json")): - await run_task(task_data) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/environments/browser/.dockerignore b/environments/browser/.dockerignore deleted file mode 100644 index f91da037..00000000 --- a/environments/browser/.dockerignore +++ /dev/null @@ -1,101 +0,0 @@ -# Git -.git -.gitignore - -# Node -environment/*/frontend/node_modules -environment/*/frontend/.next -environment/*/frontend/build -environment/*/frontend/dist -environment/*/frontend/.turbo -environment/*/frontend/.vercel -environment/*/frontend/next-env.d.ts -environment/*/frontend/package-lock.json -# General Node/Next artifacts anywhere -node_modules -**/node_modules -**/.next -**/.turbo -**/.vercel -*.log - -# Python -__pycache__ -**/__pycache__ -*.pyc -*.pyo -*.pyd -.Python -*.egg-info -.pytest_cache -.mypy_cache -.coverage -.venv -venv -env -environment/*/backend/.venv -environment/*/backend/venv -environment/*/backend/__pycache__ - -# Database - exclude ALL database files -*.db -*.sqlite -*.db-journal -*.db-wal -*.db-shm -**/*.db -**/*.sqlite -**/*.db-journal -**/*.db-wal -**/*.db-shm - -# IDE -.vscode -.idea -*.swp -*.swo - -# OS -.DS_Store -Thumbs.db - -# Documentation -*.md -!app/README.md -!launch/README.md - -# Unix sockets, locks, pids (can break Docker context on Windows) -**/*.sock -**/*.socket -**/*.pipe -**/*.pid -**/*.lock -**/*.ipc - -# Symlinks and special files -**/*.lnk -**/symlink* -**/.venv -**/.env -**/venv -**/env - -# Temporary and cache files -*.tmp -*.temp -*.cache -**/*.tmp -**/*.temp -**/*.cache -**/tmp/ -**/temp/ -**/cache/ - -# Lock files that might have special permissions -yarn.lock -poetry.lock -Pipfile.lock -**/yarn.lock -**/*.lock -environment/uv.lock -controller/uv.lock \ No newline at end of file diff --git a/environments/browser/.gitignore b/environments/browser/.gitignore deleted file mode 100644 index 5397595a..00000000 --- a/environments/browser/.gitignore +++ /dev/null @@ -1,100 +0,0 @@ -# Dependencies -node_modules/ -.pnp -.pnp.js - -# Testing -coverage/ -.coverage -.pytest_cache/ -htmlcov/ - -# Next.js -.next/ -out/ -build/ -*.tsbuildinfo -next-env.d.ts - -# Production -dist/ - -# Misc -.DS_Store -*.pem -Thumbs.db - -# Debug -npm-debug.log* -yarn-debug.log* -yarn-error.log* -.pnpm-debug.log* - -# Local env files -.env -.env.local -.env.development.local -.env.test.local -.env.production.local - -# Vercel -.vercel - -# TypeScript -*.tsbuildinfo - -# Python -__pycache__/ -*.py[cod] -*$py.class -*.so -.Python -env/ -venv/ -.venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# uv -.venv/ -uv.lock - -# Database -*.db -*.sqlite -*.sqlite3 -app.db - -# IDEs -.vscode/ -.idea/ -*.swp -*.swo -*~ -.project -.classpath -.c9/ -*.launch -.settings/ -*.sublime-workspace - -# OS -.DS_Store -.DS_Store? -._* -.Spotlight-V100 -.Trashes -ehthumbs.db -Thumbs.db - -# Logs -logs/ -*.log - -# Docker -.dockerignore.local \ No newline at end of file diff --git a/environments/browser/Dockerfile b/environments/browser/Dockerfile deleted file mode 100644 index e25a71f9..00000000 --- a/environments/browser/Dockerfile +++ /dev/null @@ -1,60 +0,0 @@ -# syntax=docker/dockerfile:1 -FROM hudevals/hud-browser-base:latest AS setup - -WORKDIR /app - -# Layer 1: Install server dependencies -COPY server/pyproject.toml /app/server/ -RUN cd /app/server && uv pip install --system --break-system-packages . 
- -# Layer 2: Install environment dependencies -COPY environment/pyproject.toml /app/environment/ -RUN cd /app/environment && uv pip install --system --break-system-packages . - -# Layer 3: Copy source code (changes here don't invalidate dependency layers) -COPY server/ /app/server/ -COPY environment/ /app/environment/ - -# Auto-discover and install/build all frontend apps -RUN set -e; \ - for pkg in $(find /app/environment -type f -path '*/frontend/package.json'); do \ - app_dir=$(dirname "$pkg"); \ - echo "Installing dependencies in $app_dir"; \ - if [ -f "$app_dir/package-lock.json" ]; then \ - (cd "$app_dir" && npm ci --no-audit --no-fund); \ - else \ - (cd "$app_dir" && npm install --no-audit --no-fund); \ - fi; \ - done && \ - for pkg in $(find /app/environment -type f -path '*/frontend/package.json'); do \ - app_dir=$(dirname "$pkg"); \ - if [ -f "$app_dir/next.config.js" ]; then \ - echo "Building Next.js app in $app_dir"; \ - (cd "$app_dir" && npm run build); \ - fi; \ - done - -# Make scripts executable -RUN find /app/environment -name "*.py" -type f -exec chmod +x {} \; - -# Environment configuration -ENV MCP_TRANSPORT="stdio" -ENV HUD_LOG_STREAM="stderr" -ENV PYTHONUNBUFFERED="1" -ENV PYTHONWARNINGS="ignore::SyntaxWarning:pyautogui" -ENV DISPLAY=":1" -ENV PYTHONPATH=/app - -# Expose ports -EXPOSE 8000 8080 3000-3200 5000-5200 - -# Simple startup: HUD_DEV=1 enables hot-reload; otherwise run production -CMD ["sh", "-c", "\ - if [ \"${HUD_DEV:-0}\" = \"1\" ]; then \ - uvicorn environment.server:app --host 0.0.0.0 --port 8000 --reload --log-level warning >&2 & \ - sleep 5 && cd /app/server && exec hud dev server.main --stdio; \ - else \ - uvicorn environment.server:app --host 0.0.0.0 --port 8000 --log-level warning >&2 & \ - sleep 5 && cd /app/server && exec python3 -m server.main; \ - fi\ -"] \ No newline at end of file diff --git a/environments/browser/Dockerfile.local b/environments/browser/Dockerfile.local deleted file mode 100644 index c5262633..00000000 --- a/environments/browser/Dockerfile.local +++ /dev/null @@ -1,72 +0,0 @@ -# syntax=docker/dockerfile:1 -# Local development Dockerfile that uses local hud-python -FROM hudevals/hud-browser-base:latest AS setup - -WORKDIR /app - -# Layer 0: Install local hud-python -# Copy local hud-python source (build context is repo root) -COPY hud /app/hud-python/hud/ -COPY pyproject.toml /app/hud-python/ -COPY README.md /app/hud-python/ -COPY LICENSE /app/hud-python/ - -# Install local hud-python -RUN cd /app/hud-python && uv pip install --system --break-system-packages -e . - -# Layer 1: Install server dependencies -COPY environments/browser/server/pyproject.toml /app/server/ -RUN cd /app/server && uv pip install --system --break-system-packages . - -# Layer 2: Install environment dependencies -COPY environments/browser/environment/pyproject.toml /app/environment/ -RUN cd /app/environment && uv pip install --system --break-system-packages . 
- -# Layer 3: Copy source code (changes here don't invalidate dependency layers) -COPY environments/browser/server/ /app/server/ -COPY environments/browser/environment/ /app/environment/ - -# Auto-discover and install/build all frontend apps -RUN set -e; \ - for pkg in $(find /app/environment -type f -path '*/frontend/package.json'); do \ - app_dir=$(dirname "$pkg"); \ - echo "Installing dependencies in $app_dir"; \ - if [ -f "$app_dir/package-lock.json" ]; then \ - (cd "$app_dir" && npm ci --no-audit --no-fund); \ - else \ - (cd "$app_dir" && npm install --no-audit --no-fund); \ - fi; \ - done && \ - for pkg in $(find /app/environment -type f -path '*/frontend/package.json'); do \ - app_dir=$(dirname "$pkg"); \ - if [ -f "$app_dir/next.config.js" ]; then \ - echo "Building Next.js app in $app_dir"; \ - (cd "$app_dir" && npm run build); \ - fi; \ - done - -# Make scripts executable -RUN find /app/environment -name "*.py" -type f -exec chmod +x {} \; - -# Environment configuration -ENV MCP_TRANSPORT="stdio" -ENV HUD_LOG_STREAM="stderr" -ENV PYTHONUNBUFFERED="1" -ENV PYTHONWARNINGS="ignore::SyntaxWarning:pyautogui" -ENV DISPLAY=":1" -ENV PYTHONPATH=/app - -# Expose ports -EXPOSE 8000 8080 3000-3200 5000-5200 - -# Simple startup: HUD_DEV=1 enables hot-reload; otherwise run production -CMD ["sh", "-c", "\ - if [ \"${HUD_DEV:-0}\" = \"1\" ]; then \ - uvicorn environment.server:app --host 0.0.0.0 --port 8000 --reload --log-level warning >&2 & \ - sleep 5 && cd /app/server && exec hud dev server.main --stdio; \ - else \ - uvicorn environment.server:app --host 0.0.0.0 --port 8000 --log-level warning >&2 & \ - sleep 5 && cd /app/server && exec python3 -m server.main; \ - fi\ -"] - diff --git a/environments/browser/README.md b/environments/browser/README.md deleted file mode 100644 index 005e1333..00000000 --- a/environments/browser/README.md +++ /dev/null @@ -1,191 +0,0 @@ -# Browser Environment - -Browser automation environment with GUI access for testing web applications. Includes sample apps (2048, Todo) and supports hot-reload development. - -## Architecture - -**`environment/`** - Produces structured data -- FastAPI backend with X11/VNC services (Linux-only) -- Launches and manages web apps (Next.js frontends + Python backends) -- Exposes HTTP endpoints for app control and state - -**`server/`** - Wraps data in MCP tools -- Browser automation tools (Playwright, computer vision) -- Setup tools (launch apps, seed data) -- Evaluation tools (check game state, todo completion) - -**Why separate?** The environment backend requires X11/VNC/Chromium (Docker-only). The MCP server tools can be edited with hot-reload, while the heavy environment stays running. - -## Development - -This environment **requires Docker** due to X11/VNC dependencies. - -```bash -# Build first (creates hud-browser:0.1.0) -hud build - -# Start with hot-reload -hud dev -``` - -When you run `hud dev` in an environment with a Dockerfile, it automatically: -- Detects Docker mode is needed -- Mounts `server/` and `environment/` as volumes -- Enables hot-reload for both layers - -Edit files in `server/` or `environment/` and they reload inside the container! - -## Publishing Your Environment - -Once your environment is ready, you can share it with the community: - -### 1. Push to Registry -```bash -# Build and push your environment (requires docker hub login and hud api key) -hud build -hud push -``` - -### 2. Create a Dataset - -Create a dataset on HuggingFace with your tasks: - -**Option A: Upload manually** -1. 
Upload your `tasks.json` to HuggingFace -2. Make sure it's **public** to appear on leaderboards - -**Option B: Use the SDK** -```python -from hud.datasets import save_tasks -import json - -# Load your tasks -with open("tasks.json") as f: - tasks = json.load(f) - -# Push to HuggingFace -save_tasks(tasks, repo_id="your-org/your-dataset") -``` - -### 3. Run and Track Performance - -```bash -# Run Claude on your benchmark -hud eval "your-org/your-dataset" --agent claude - -# View results at: -# hud.ai/leaderboards/your-org/your-dataset -``` - -**Note**: Only public HuggingFace datasets appear as leaderboards! - -๐Ÿ“š Learn more: [Creating Benchmarks](https://docs.hud.ai/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.ai/evaluate-agents/leaderboards) - -## Architecture Overview - -The browser environment uses a two-process architecture: - -1. **Context Server** (`context.py`): Long-running process that maintains persistent state -2. **MCP Server** (`server.py`): Hot-reloadable process that handles tool requests - -### Key Components - -- **BrowserContext**: Stores persistent state (running apps, ports, playwright instance) -- **ServiceManager**: Manages X11, VNC, and app processes -- **BaseHub Tools**: Setup and evaluate tools organized by app (2048, todo) -- **Multiprocessing Proxy**: Enables state sharing between processes - -### 1. Tool Implementation Pattern - -All setup and evaluate tools should follow this pattern: - -```python -@setup.tool("tool_name") -async def tool_name(param1: type, param2: type): - """Tool description.""" - try: - # Get persistent context - persistent_ctx = setup.env # or evaluate.env - - # Get app ports - backend_port = persistent_ctx.get_app_backend_port("app_name") - - # Make HTTP request - url = f"http://localhost:{backend_port}/api/endpoint" - async with httpx.AsyncClient() as client: - response = await client.method(url, json=data) - response.raise_for_status() - result = response.json() - - # Return result - return TextContent( - text=f"Success message", - type="text" - ) - except Exception as e: - logger.error(f"tool_name failed: {e}") - return TextContent( - text=f"Failed: {str(e)}", - type="text" - ) -``` - -### 2. App Launch Pattern - -When launching apps, ensure ports are stored in the persistent context: - -```python -# In launch_app tool -app_info = await service_manager.launch_app(app_name) - -# Store ports in persistent context for later access -try: - backend_port = service_manager.get_app_port(app_name) - frontend_port = service_manager.get_app_frontend_port(app_name) - persistent_ctx.set_app_ports(app_name, frontend_port, backend_port) -except Exception as e: - logger.error(f"Failed to store ports: {e}") - -# Track app in persistent context -persistent_ctx.add_running_app(app_name) -``` - -### 3. Import Organization - -Keep imports at module level: - -```python -# At top of file -import logging -import httpx -from mcp.types import TextContent -from . import setup - -# Not inside functions -``` - -## Development Workflow - -1. **Start the environment**: `hud dev` -2. **Make changes**: Edit tools in `src/hud_controller/` -3. **Test immediately**: The MCP server hot-reloads automatically -4. **Check logs**: Look for serialization or proxy errors - -## Adding New Apps - -1. Create app directory in `apps/` -2. Add setup tools in `src/hud_controller/setup/app_name.py` -3. Add evaluate tools in `src/hud_controller/evaluate/app_name.py` -4. Follow the HTTP pattern - no `call_app_api` usage -5. 
Store app ports in persistent context when launching - -## Key Files - -- `context.py`: Persistent state management -- `server.py`: MCP server and tool definitions -- `services.py`: Process management for X11, VNC, apps -- `setup/`: Setup tools organized by app -- `evaluate/`: Evaluation tools organized by app - -Remember: When in doubt, make direct HTTP calls and store state in the persistent context! - diff --git a/environments/browser/browser-base/Dockerfile b/environments/browser/browser-base/Dockerfile deleted file mode 100644 index 57eb9132..00000000 --- a/environments/browser/browser-base/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -# syntax=docker/dockerfile:1 -FROM ubuntu:24.04 AS setup - -# Update and install core dependencies (including working Chromium browser) -RUN apt-get update -y \ - && apt-get install -y --no-install-recommends \ - vim \ - openssl \ - ca-certificates \ - curl \ - wget \ - sudo \ - bash \ - net-tools \ - novnc \ - x11vnc \ - xvfb \ - xfce4 \ - locales \ - libpq5 \ - sqlite3 \ - dbus-x11 \ - xfce4-terminal \ - xfonts-base \ - xdotool \ - psmisc \ - scrot \ - pm-utils \ - build-essential \ - unzip \ - xauth \ - gnupg \ - gpg \ - jq \ - git \ - build-essential \ - nodejs \ - npm - -RUN update-ca-certificates - -RUN curl -LsSf https://astral.sh/uv/install.sh | sh -ENV PATH="/root/.local/bin:$PATH" - -# Install git for dependency installation -RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* - -# Install Playwright -RUN uv pip install --system --break-system-packages playwright -RUN python3 -m playwright install chromium --with-deps \ No newline at end of file diff --git a/environments/browser/browser-base/README.md b/environments/browser/browser-base/README.md deleted file mode 100644 index 21999fec..00000000 --- a/environments/browser/browser-base/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# Browser Base Image - -Base Docker image for browser environments with Playwright, Chromium, and VNC support. - -## Build - -```bash -docker build -t browser-base:latest . -``` - -## Test with VNC Access - -### 1. Start the container - -```bash -docker run -it --rm \ - -p 6080:6080 \ - -p 5900:5900 \ - -e DISPLAY=:1 \ - browser-base:latest \ - bash -``` - -### 2. Inside the container, start display servers - -```bash -Xvfb :1 -screen 0 1920x1080x24 > /dev/null 2>&1 & -x11vnc -display :1 -nopw -listen 0.0.0.0 -forever > /dev/null 2>&1 & -/usr/share/novnc/utils/novnc_proxy --vnc localhost:5900 --listen 6080 > /dev/null 2>&1 & -``` - -### 3. Test Playwright - -```bash -python3 -c " -from playwright.sync_api import sync_playwright -with sync_playwright() as p: - browser = p.chromium.launch(headless=False) - page = browser.new_page() - page.goto('https://example.com') - print('Title:', page.title()) - input('Press Enter to close...') - browser.close() -" -``` - -### 4. View in browser - -Open `http://localhost:6080/vnc.html` to see Chromium running. - -## What's Included - -- Ubuntu 24.04 -- Desktop environment (Xvfb, x11vnc, noVNC, xfce4) -- Node.js & npm -- Python 3 with uv package manager -- Playwright with Chromium -- Development tools (git, curl, wget, etc.) 
\ No newline at end of file diff --git a/environments/browser/environment/2048/README.md b/environments/browser/environment/2048/README.md deleted file mode 100644 index 474b0c6d..00000000 --- a/environments/browser/environment/2048/README.md +++ /dev/null @@ -1,103 +0,0 @@ -# 2048 Game for Browser Environment - -A browser-based implementation of the 2048 game with configurable target tiles and reward system for RL evaluation. - -## Features - -- **Configurable Target Tile**: Set any power of 2 as target (64, 128, 256, 512, 1024, 2048, etc.) -- **Logarithmic Reward Scaling**: Smooth reward progression using `log(highest_tile) / log(target)` -- **Efficiency Tracking**: Monitor score-to-moves ratio -- **Flexible Board Size**: Support for 3x3 to 6x6 grids -- **Full Evaluation API**: Compatible with RL evaluation system - -## Architecture - -### Backend (FastAPI) -- Core game logic in `game.py` -- RESTful API endpoints for game control -- Evaluation endpoints for RL agents -- SQLite persistence (optional) - -### Frontend (Next.js + React) -- Responsive game board with smooth animations -- Keyboard and touch controls -- Real-time score and progress tracking -- Customizable game parameters - -## Running the Game - -### Standalone -```bash -python launch.py --frontend-port 3001 --backend-port 5001 -``` - -### With Browser Environment -The game integrates with the browser environment's setup and evaluation system. - -## API Endpoints - -### Core Game -- `POST /api/game/new` - Start new game -- `GET /api/game/state` - Get current state -- `POST /api/game/move` - Make a move -- `POST /api/game/set_target` - Set target tile - -### Evaluation -- `GET /api/eval/stats` - Get comprehensive stats -- `GET /api/eval/max_number` - Get highest tile -- `GET /api/eval/efficiency` - Get efficiency ratio -- `POST /api/eval/set_board` - Set specific board -- `POST /api/eval/reset` - Reset game - -## Evaluators - -- `game_2048_max_number` - Check if target tile reached (logarithmic reward) -- `game_2048_efficiency` - Evaluate score/moves ratio -- `game_2048_score_reached` - Check if target score reached -- `game_2048_game_won` - Check if game is won -- `game_2048_game_over` - Check if game is over -- `game_2048_moves_made` - Check minimum moves made - -## Setup Tools - -- `game_2048_board` - Initialize game with size and target -- `game_2048_set_board` - Set specific board state -- `game_2048_near_win` - Set board near winning -- `game_2048_navigate` - Navigate to game URL -- `game_2048_reset` - Reset to initial state - -## Reward System - -The reward system matches the text-2048 environment: - -1. **Max Number Reward**: `min(1.0, log(highest_tile) / log(target))` - - Logarithmic scaling for smooth progression - - Reaches 1.0 when target tile is achieved - -2. 
**Efficiency Reward**: `min(1.0, ratio / min_ratio)` - - Linear scaling based on score/moves ratio - - Encourages efficient gameplay - -## Development - -### Backend Requirements -- Python 3.8+ -- FastAPI -- NumPy -- uvicorn - -### Frontend Requirements -- Node.js 16+ -- Next.js 14 -- React 18 -- Tailwind CSS - -## Testing - -The game can be tested with the browser environment's evaluation system: - -```python -# Example evaluation -ctx = Context() -result = await game_2048_max_number(ctx, target=2048) -``` \ No newline at end of file diff --git a/environments/browser/environment/2048/backend/game.py b/environments/browser/environment/2048/backend/game.py deleted file mode 100644 index e13f3b38..00000000 --- a/environments/browser/environment/2048/backend/game.py +++ /dev/null @@ -1,241 +0,0 @@ -"""2048 Game Logic for Browser Environment""" - -import random -import numpy as np -from typing import Tuple, Optional, List - - -class Game2048: - """Browser-based 2048 game implementation with configurable target""" - - def __init__(self, size: int = 4, target_tile: int = 2048): - self.size = size - self.target_tile = target_tile - self.board = np.zeros((size, size), dtype=int) - self.score = 0 - self.game_over = False - self.moves_made = 0 - self.won = False - - # Start with 2 random tiles - self.add_random_tile() - self.add_random_tile() - - # Track initial highest tile for reward calculation - self.initial_highest_tile = int(self.board.max()) - - def add_random_tile(self) -> bool: - """Add a random 2 or 4 tile to an empty position""" - empty_cells = [ - (i, j) for i in range(self.size) for j in range(self.size) if self.board[i, j] == 0 - ] - - if not empty_cells: - return False - - i, j = random.choice(empty_cells) - # 90% chance of 2, 10% chance of 4 - self.board[i, j] = 2 if random.random() < 0.9 else 4 - return True - - def compress(self, row: np.ndarray) -> Tuple[np.ndarray, int]: - """Compress a row by moving all non-zero elements to the left and merging""" - new_row = np.zeros_like(row) - pos = 0 - score = 0 - - # Move all non-zero elements to the left - for num in row: - if num != 0: - new_row[pos] = num - pos += 1 - - # Merge adjacent equal elements - i = 0 - while i < len(new_row) - 1: - if new_row[i] != 0 and new_row[i] == new_row[i + 1]: - new_row[i] *= 2 - score += new_row[i] - new_row[i + 1] = 0 - i += 2 - else: - i += 1 - - # Compress again after merging - final_row = np.zeros_like(row) - pos = 0 - for num in new_row: - if num != 0: - final_row[pos] = num - pos += 1 - - return final_row, score - - def move(self, direction: str) -> bool: - """Make a move in the specified direction""" - if self.game_over: - return False - - direction = direction.lower() - if direction not in ["up", "down", "left", "right"]: - return False - - original_board = self.board.copy() - move_score = 0 - - if direction == "left": - for i in range(self.size): - self.board[i], row_score = self.compress(self.board[i]) - move_score += row_score - - elif direction == "right": - for i in range(self.size): - reversed_row = self.board[i][::-1] - compressed, row_score = self.compress(reversed_row) - self.board[i] = compressed[::-1] - move_score += row_score - - elif direction == "up": - for j in range(self.size): - column = self.board[:, j] - compressed, col_score = self.compress(column) - self.board[:, j] = compressed - move_score += col_score - - elif direction == "down": - for j in range(self.size): - column = self.board[:, j][::-1] - compressed, col_score = self.compress(column) - self.board[:, j] = 
compressed[::-1] - move_score += col_score - - # Check if the board changed - if not np.array_equal(original_board, self.board): - self.score += move_score - self.moves_made += 1 - self.add_random_tile() - self.check_game_status() - return True - - return False - - def check_game_status(self): - """Check if the game is won or over""" - # Check if target tile is reached - if not self.won and self.board.max() >= self.target_tile: - self.won = True - - # Check if game is over (no valid moves) - # Check for empty cells - if 0 in self.board: - self.game_over = False - return - - # Check for possible merges - for i in range(self.size): - for j in range(self.size): - current = self.board[i, j] - # Check right neighbor - if j < self.size - 1 and current == self.board[i, j + 1]: - self.game_over = False - return - # Check bottom neighbor - if i < self.size - 1 and current == self.board[i + 1, j]: - self.game_over = False - return - - self.game_over = True - - def get_state(self) -> dict: - """Get the current game state as a dictionary""" - return { - "board": self.board.tolist(), - "score": int(self.score), - "moves": int(self.moves_made), - "game_over": bool(self.game_over), - "won": bool(self.won), - "highest_tile": int(self.board.max()), - "initial_highest_tile": int(self.initial_highest_tile), - "target_tile": self.target_tile, - "board_size": self.size, - } - - def set_board(self, board: List[List[int]], score: int = 0, moves: int = 0): - """Set a specific board configuration (for testing)""" - self.board = np.array(board, dtype=int) - self.score = score - self.moves_made = moves - self.check_game_status() - - def reset(self, size: Optional[int] = None, target_tile: Optional[int] = None): - """Reset the game to initial state - - Args: - size: Optional new board size - target_tile: Optional new target tile - """ - if size is not None: - self.size = size - if target_tile is not None: - self.target_tile = target_tile - - self.board = np.zeros((self.size, self.size), dtype=int) - self.score = 0 - self.game_over = False - self.won = False - self.moves_made = 0 - self.add_random_tile() - self.add_random_tile() - - # Track initial highest tile after reset - self.initial_highest_tile = int(self.board.max()) - - def can_move(self) -> dict: - """Check which moves are valid""" - valid_moves = {"up": False, "down": False, "left": False, "right": False} - - if self.game_over: - return valid_moves - - # Test each direction without modifying the actual board - original_board = self.board.copy() - - for direction in ["up", "down", "left", "right"]: - test_board = original_board.copy() - self.board = test_board - - # Try the move - if direction == "left": - for i in range(self.size): - compressed, _ = self.compress(self.board[i]) - if not np.array_equal(self.board[i], compressed): - valid_moves[direction] = True - break - - elif direction == "right": - for i in range(self.size): - reversed_row = self.board[i][::-1] - compressed, _ = self.compress(reversed_row) - if not np.array_equal(reversed_row, compressed): - valid_moves[direction] = True - break - - elif direction == "up": - for j in range(self.size): - column = self.board[:, j] - compressed, _ = self.compress(column) - if not np.array_equal(column, compressed): - valid_moves[direction] = True - break - - elif direction == "down": - for j in range(self.size): - column = self.board[:, j][::-1] - compressed, _ = self.compress(column) - if not np.array_equal(column, compressed): - valid_moves[direction] = True - break - - # Restore original board - self.board 
= original_board - return valid_moves diff --git a/environments/browser/environment/2048/backend/main.py b/environments/browser/environment/2048/backend/main.py deleted file mode 100644 index 8cfba5ce..00000000 --- a/environments/browser/environment/2048/backend/main.py +++ /dev/null @@ -1,246 +0,0 @@ -"""FastAPI backend for 2048 game""" - -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel -from typing import List, Optional -from datetime import datetime -import sqlite3 -import json -from game import Game2048 - -app = FastAPI(title="2048 Game API", version="1.0.0") - -# Configure CORS -app.add_middleware( - CORSMiddleware, - allow_origins=["http://localhost:3001"], # Different port from todo app - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Global game instance (in production, would use sessions/database) -game = Game2048() - - -# Pydantic models -class NewGameRequest(BaseModel): - board_size: int = 4 - target_tile: int = 2048 - - -class MoveRequest(BaseModel): - direction: str # up, down, left, right - - -class SetBoardRequest(BaseModel): - board: List[List[int]] - score: Optional[int] = 0 - moves: Optional[int] = 0 - - -class SetTargetRequest(BaseModel): - target_tile: int - - -class GameState(BaseModel): - board: List[List[int]] - score: int - moves: int - game_over: bool - won: bool - highest_tile: int - initial_highest_tile: int - target_tile: int - board_size: int - - -class EvaluationStats(BaseModel): - board: List[List[int]] - score: int - moves: int - highest_tile: int - target_tile: int - efficiency: float - game_over: bool - won: bool - valid_moves: dict - - -# === CORE GAME API ROUTES === - - -@app.get("/api/status") -def status(): - """Health check endpoint""" - return {"status": "ok", "timestamp": datetime.now().isoformat()} - - -@app.post("/api/game/new", response_model=GameState) -def new_game(request: NewGameRequest): - """Start a new game with specified parameters""" - global game - game = Game2048(size=request.board_size, target_tile=request.target_tile) - return game.get_state() - - -@app.get("/api/game/state", response_model=GameState) -def get_game_state(): - """Get current game state""" - return game.get_state() - - -@app.post("/api/game/move", response_model=GameState) -def make_move(request: MoveRequest): - """Make a move in the specified direction""" - valid = game.move(request.direction) - if not valid and not game.game_over: - raise HTTPException(status_code=400, detail="Invalid move") - return game.get_state() - - -@app.post("/api/game/set_target", response_model=GameState) -def set_target(request: SetTargetRequest): - """Set the target tile for the game""" - game.target_tile = request.target_tile - game.check_game_status() # Re-check win condition - return game.get_state() - - -@app.get("/api/game/valid_moves") -def get_valid_moves(): - """Get which moves are currently valid""" - return game.can_move() - - -# === EVALUATION API ROUTES === - - -@app.get("/api/eval/health") -def eval_health(): - """Health check endpoint for evaluation system""" - return { - "status": "healthy", - "game_active": not game.game_over, - "highest_tile": int(game.board.max()), - "target_tile": game.target_tile, - "timestamp": datetime.now().isoformat(), - } - - -@app.get("/api/eval/stats", response_model=EvaluationStats) -def get_evaluation_stats(): - """Comprehensive evaluation statistics for the game""" - state = game.get_state() - efficiency = state["score"] / 
state["moves"] if state["moves"] > 0 else 0.0 - - return EvaluationStats( - board=state["board"], - score=state["score"], - moves=state["moves"], - highest_tile=state["highest_tile"], - target_tile=state["target_tile"], - efficiency=efficiency, - game_over=state["game_over"], - won=state["won"], - valid_moves=game.can_move(), - ) - - -@app.get("/api/eval/max_number") -def get_max_number(): - """Get the highest tile value for evaluation""" - state = game.get_state() - return { - "highest_tile": state["highest_tile"], - "target_tile": state["target_tile"], - "progress": state["highest_tile"] / state["target_tile"] if state["target_tile"] > 0 else 0, - "timestamp": datetime.now().isoformat(), - } - - -@app.get("/api/eval/efficiency") -def get_efficiency(): - """Get the game efficiency (score/moves ratio)""" - state = game.get_state() - efficiency = state["score"] / state["moves"] if state["moves"] > 0 else 0.0 - - return { - "score": state["score"], - "moves": state["moves"], - "efficiency": efficiency, - "timestamp": datetime.now().isoformat(), - } - - -@app.get("/api/eval/board") -def get_board(): - """Get current board state for evaluation""" - state = game.get_state() - return { - "board": state["board"], - "board_size": state["board_size"], - "empty_cells": sum(1 for row in state["board"] for cell in row if cell == 0), - "timestamp": datetime.now().isoformat(), - } - - -@app.post("/api/eval/set_board", response_model=GameState) -def set_board(request: SetBoardRequest): - """Set a specific board configuration for testing""" - try: - game.set_board(request.board, request.score, request.moves) - return game.get_state() - except Exception as e: - raise HTTPException(status_code=400, detail=str(e)) - - -@app.post("/api/eval/reset", response_model=GameState) -def reset_game(): - """Reset game to initial state""" - game.reset() - return game.get_state() - - -@app.post("/api/eval/seed") -def seed_test_board(): - """Seed the board with a test configuration""" - # Create a board that's close to winning - test_board = [[1024, 512, 256, 128], [64, 32, 16, 8], [4, 2, 0, 0], [0, 0, 0, 0]] - game.set_board(test_board, score=10000, moves=100) - - return { - "message": "Test board seeded successfully", - "highest_tile": 1024, - "timestamp": datetime.now().isoformat(), - } - - -@app.post("/api/eval/seed_custom") -def seed_custom_board(board: List[List[int]]): - """Seed the board with a custom configuration""" - try: - game.set_board(board) - state = game.get_state() - return { - "message": "Custom board seeded successfully", - "highest_tile": state["highest_tile"], - "timestamp": datetime.now().isoformat(), - } - except Exception as e: - raise HTTPException(status_code=400, detail=str(e)) - - -@app.get("/api/eval/can_move") -def can_move(): - """Check if any moves are available""" - valid_moves = game.can_move() - has_moves = any(valid_moves.values()) - - return { - "can_move": has_moves, - "valid_moves": valid_moves, - "game_over": game.game_over, - "timestamp": datetime.now().isoformat(), - } diff --git a/environments/browser/environment/2048/backend/pyproject.toml b/environments/browser/environment/2048/backend/pyproject.toml deleted file mode 100644 index d3c16ae0..00000000 --- a/environments/browser/environment/2048/backend/pyproject.toml +++ /dev/null @@ -1,9 +0,0 @@ -[project] -name = "game-2048-backend" -version = "1.0.0" -dependencies = [ - "fastapi", - "uvicorn", - "numpy", - "pydantic" -] \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/app/globals.css 
b/environments/browser/environment/2048/frontend/app/globals.css deleted file mode 100644 index bd6213e1..00000000 --- a/environments/browser/environment/2048/frontend/app/globals.css +++ /dev/null @@ -1,3 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/app/layout.tsx b/environments/browser/environment/2048/frontend/app/layout.tsx deleted file mode 100644 index bcb24f69..00000000 --- a/environments/browser/environment/2048/frontend/app/layout.tsx +++ /dev/null @@ -1,22 +0,0 @@ -import type { Metadata } from 'next' -import { Inter } from 'next/font/google' -import './globals.css' - -const inter = Inter({ subsets: ['latin'] }) - -export const metadata: Metadata = { - title: '2048 Game', - description: 'A browser-based 2048 game with configurable targets', -} - -export default function RootLayout({ - children, -}: { - children: React.ReactNode -}) { - return ( - - {children} - - ) -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/app/page.tsx b/environments/browser/environment/2048/frontend/app/page.tsx deleted file mode 100644 index 3b56cede..00000000 --- a/environments/browser/environment/2048/frontend/app/page.tsx +++ /dev/null @@ -1,190 +0,0 @@ -'use client'; - -import { useState, useEffect, useCallback } from 'react'; -import GameBoard from '../components/GameBoard'; -import GameControls from '../components/GameControls'; - -// Dynamically determine API URL based on current port -// Backend is always on frontend_port + 1 -const getApiUrl = () => { - if (typeof window !== 'undefined') { - const currentPort = parseInt(window.location.port) || 3000; - return `http://localhost:${currentPort + 1}`; - } - return process.env.NEXT_PUBLIC_API_URL || 'http://localhost:5001'; -}; - -const API_URL = getApiUrl(); - -interface GameState { - board: number[][]; - score: number; - moves: number; - game_over: boolean; - won: boolean; - highest_tile: number; - target_tile: number; - board_size: number; -} - -export default function Game2048() { - const [gameState, setGameState] = useState(null); - const [loading, setLoading] = useState(false); - const [message, setMessage] = useState(''); - - // Load initial game state - useEffect(() => { - fetchGameState(); - }, []); - - // Handle keyboard input - useEffect(() => { - const handleKeyPress = (e: KeyboardEvent) => { - if (gameState?.game_over) return; - - const keyMap: { [key: string]: string } = { - 'ArrowUp': 'up', - 'ArrowDown': 'down', - 'ArrowLeft': 'left', - 'ArrowRight': 'right', - }; - - const direction = keyMap[e.key]; - if (direction) { - e.preventDefault(); - makeMove(direction); - } - }; - - window.addEventListener('keydown', handleKeyPress); - return () => window.removeEventListener('keydown', handleKeyPress); - }, [gameState]); - - const fetchGameState = async () => { - try { - const response = await fetch(`${API_URL}/api/game/state`); - const data = await response.json(); - setGameState(data); - } catch (error) { - console.error('Error fetching game state:', error); - setMessage('Error loading game'); - } - }; - - const makeMove = async (direction: string) => { - if (loading) return; - setLoading(true); - - try { - const response = await fetch(`${API_URL}/api/game/move`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ direction }), - }); - - if (response.ok) { - const data = await response.json(); - setGameState(data); - - if (data.won && 
!gameState?.won) { - setMessage(`๐ŸŽ‰ You reached ${data.target_tile}!`); - } else if (data.game_over) { - setMessage('Game Over! No more moves available.'); - } - } else { - // Invalid move, just ignore - } - } catch (error) { - console.error('Error making move:', error); - } finally { - setLoading(false); - } - }; - - const newGame = async (boardSize: number = 4, targetTile: number = 2048) => { - setLoading(true); - setMessage(''); - - try { - const response = await fetch(`${API_URL}/api/game/new`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ board_size: boardSize, target_tile: targetTile }), - }); - - const data = await response.json(); - setGameState(data); - } catch (error) { - console.error('Error starting new game:', error); - setMessage('Error starting new game'); - } finally { - setLoading(false); - } - }; - - // Touch/swipe handling - const [touchStart, setTouchStart] = useState<{ x: number; y: number } | null>(null); - - const handleTouchStart = (e: React.TouchEvent) => { - const touch = e.touches[0]; - setTouchStart({ x: touch.clientX, y: touch.clientY }); - }; - - const handleTouchEnd = (e: React.TouchEvent) => { - if (!touchStart) return; - - const touch = e.changedTouches[0]; - const deltaX = touch.clientX - touchStart.x; - const deltaY = touch.clientY - touchStart.y; - const minSwipeDistance = 50; - - if (Math.abs(deltaX) > Math.abs(deltaY)) { - // Horizontal swipe - if (Math.abs(deltaX) > minSwipeDistance) { - makeMove(deltaX > 0 ? 'right' : 'left'); - } - } else { - // Vertical swipe - if (Math.abs(deltaY) > minSwipeDistance) { - makeMove(deltaY > 0 ? 'down' : 'up'); - } - } - - setTouchStart(null); - }; - - if (!gameState) { - return ( -
-
Loading game...
-
- ); - } - - return ( -
-
-

2048

- - - -
- -
- -
-

Use arrow keys to play

-

Combine tiles to reach {gameState.target_tile}!

-
-
-
- ); -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/components/GameBoard.tsx b/environments/browser/environment/2048/frontend/components/GameBoard.tsx deleted file mode 100644 index d5678e41..00000000 --- a/environments/browser/environment/2048/frontend/components/GameBoard.tsx +++ /dev/null @@ -1,31 +0,0 @@ -import React from 'react'; -import GameTile from './GameTile'; - -interface GameBoardProps { - board: number[][]; -} - -export default function GameBoard({ board }: GameBoardProps) { - const boardSize = board.length; - - return ( -
-
- {board.map((row, i) => - row.map((value, j) => ( - - )) - )} -
-
- ); -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/components/GameControls.tsx b/environments/browser/environment/2048/frontend/components/GameControls.tsx deleted file mode 100644 index b89b3613..00000000 --- a/environments/browser/environment/2048/frontend/components/GameControls.tsx +++ /dev/null @@ -1,104 +0,0 @@ -import React, { useState } from 'react'; - -interface GameState { - score: number; - moves: number; - game_over: boolean; - won: boolean; - highest_tile: number; - target_tile: number; -} - -interface GameControlsProps { - gameState: GameState; - onNewGame: (boardSize: number, targetTile: number) => void; - message: string; -} - -export default function GameControls({ gameState, onNewGame, message }: GameControlsProps) { - const [targetTile, setTargetTile] = useState(gameState.target_tile); - const [boardSize, setBoardSize] = useState(4); - - const efficiency = gameState.moves > 0 - ? (gameState.score / gameState.moves).toFixed(1) - : '0.0'; - - return ( -
- {/* Score and Stats */} -
-
-
Score
-
{gameState.score}
-
-
-
Moves
-
{gameState.moves}
-
-
-
Highest
-
{gameState.highest_tile}
-
-
-
Efficiency
-
{efficiency}
-
-
- - {/* Game Controls */} -
-
-
- - -
- -
- - -
- - -
-
- - {/* Status Message */} - {message && ( -
- {message} -
- )} -
- ); -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/components/GameTile.tsx b/environments/browser/environment/2048/frontend/components/GameTile.tsx deleted file mode 100644 index e3b4bdfc..00000000 --- a/environments/browser/environment/2048/frontend/components/GameTile.tsx +++ /dev/null @@ -1,53 +0,0 @@ -import React from 'react'; - -interface GameTileProps { - value: number; - position: { row: number; col: number }; -} - -export default function GameTile({ value }: GameTileProps) { - const getTileColor = (val: number): string => { - const colors: { [key: number]: string } = { - 0: 'bg-gray-200', - 2: 'bg-yellow-100', - 4: 'bg-yellow-200', - 8: 'bg-orange-300', - 16: 'bg-orange-400', - 32: 'bg-orange-500', - 64: 'bg-red-400', - 128: 'bg-yellow-300', - 256: 'bg-yellow-400', - 512: 'bg-yellow-500', - 1024: 'bg-yellow-600', - 2048: 'bg-yellow-700', - 4096: 'bg-purple-600', - 8192: 'bg-purple-700', - }; - return colors[val] || 'bg-purple-800'; - }; - - const getTextSize = (val: number): string => { - if (val === 0) return ''; - if (val < 100) return 'text-3xl'; - if (val < 1000) return 'text-2xl'; - return 'text-xl'; - }; - - const getTextColor = (val: number): string => { - return val > 4 ? 'text-white' : 'text-gray-800'; - }; - - return ( -
- {value > 0 && value} -
- ); -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/next.config.js b/environments/browser/environment/2048/frontend/next.config.js deleted file mode 100644 index cf97dc63..00000000 --- a/environments/browser/environment/2048/frontend/next.config.js +++ /dev/null @@ -1,6 +0,0 @@ -/** @type {import('next').NextConfig} */ -const nextConfig = { - reactStrictMode: true, -} - -module.exports = nextConfig \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/package.json b/environments/browser/environment/2048/frontend/package.json deleted file mode 100644 index 7a7e412c..00000000 --- a/environments/browser/environment/2048/frontend/package.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "game-2048-frontend", - "version": "1.0.0", - "private": true, - "scripts": { - "dev": "next dev", - "build": "next build", - "start": "next start", - "lint": "next lint" - }, - "dependencies": { - "next": "14.1.0", - "react": "^18", - "react-dom": "^18", - "swr": "^2.2.4" - }, - "devDependencies": { - "@types/node": "^20", - "@types/react": "^18", - "@types/react-dom": "^18", - "autoprefixer": "^10.0.1", - "eslint": "^8", - "eslint-config-next": "14.1.0", - "postcss": "^8", - "tailwindcss": "^3.3.0", - "typescript": "^5" - } -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/postcss.config.js b/environments/browser/environment/2048/frontend/postcss.config.js deleted file mode 100644 index 96bb01e7..00000000 --- a/environments/browser/environment/2048/frontend/postcss.config.js +++ /dev/null @@ -1,6 +0,0 @@ -module.exports = { - plugins: { - tailwindcss: {}, - autoprefixer: {}, - }, -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/tailwind.config.js b/environments/browser/environment/2048/frontend/tailwind.config.js deleted file mode 100644 index 47bc0bad..00000000 --- a/environments/browser/environment/2048/frontend/tailwind.config.js +++ /dev/null @@ -1,12 +0,0 @@ -/** @type {import('tailwindcss').Config} */ -module.exports = { - content: [ - './pages/**/*.{js,ts,jsx,tsx,mdx}', - './components/**/*.{js,ts,jsx,tsx,mdx}', - './app/**/*.{js,ts,jsx,tsx,mdx}', - ], - theme: { - extend: {}, - }, - plugins: [], -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/tsconfig.json b/environments/browser/environment/2048/frontend/tsconfig.json deleted file mode 100644 index 9b9948d5..00000000 --- a/environments/browser/environment/2048/frontend/tsconfig.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "compilerOptions": { - "target": "es5", - "lib": ["dom", "dom.iterable", "esnext"], - "allowJs": true, - "skipLibCheck": true, - "strict": true, - "noEmit": true, - "esModuleInterop": true, - "module": "esnext", - "moduleResolution": "bundler", - "resolveJsonModule": true, - "isolatedModules": true, - "jsx": "preserve", - "incremental": true, - "plugins": [ - { - "name": "next" - } - ], - "paths": { - "@/*": ["./*"] - } - }, - "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], - "exclude": ["node_modules"] -} \ No newline at end of file diff --git a/environments/browser/environment/2048/launch.py b/environments/browser/environment/2048/launch.py deleted file mode 100644 index a5645668..00000000 --- a/environments/browser/environment/2048/launch.py +++ /dev/null @@ -1,284 +0,0 @@ -#!/usr/bin/env python3 -"""2048 game launcher script.""" - -import subprocess -import time -import signal -import sys -import argparse 
-import logging -import os -import socket -from pathlib import Path -from typing import Optional - -# Configure logging to stderr to avoid stdio contamination -logging.basicConfig(level=logging.INFO, format="[%(asctime)s] 2048: %(message)s", stream=sys.stderr) - -# Global variables to track processes -frontend_process: Optional[subprocess.Popen] = None -backend_process: Optional[subprocess.Popen] = None - - -def cleanup_processes(): - """Clean up running processes.""" - global frontend_process, backend_process - logging.info("Shutting down services...") - - for proc in [frontend_process, backend_process]: - if proc and proc.poll() is None: - proc.terminate() - try: - proc.wait(timeout=5) - except subprocess.TimeoutExpired: - proc.kill() - - -def signal_handler(sig, frame): - """Handle shutdown signals.""" - cleanup_processes() - sys.exit(0) - - -def check_port_available(port: int) -> bool: - """Check if a port is available.""" - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(1) - try: - result = sock.connect_ex(("localhost", port)) - sock.close() - return result != 0 # Port is available if connection fails - except: - return True - - -def launch_app(frontend_port: int = 3001, backend_port: int = 5001): - """Launch the 2048 game with frontend and backend.""" - global frontend_process, backend_process - - # Set up signal handlers - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - # Get current directory - app_dir = Path(__file__).parent - frontend_dir = app_dir / "frontend" - backend_dir = app_dir / "backend" - - logging.info( - f"Starting 2048 game - Frontend port: {frontend_port}, Backend port: {backend_port}" - ) - - # Check if ports are available - if not check_port_available(backend_port): - logging.warning(f"Backend port {backend_port} is already in use") - if not check_port_available(frontend_port): - logging.warning(f"Frontend port {frontend_port} is already in use") - - # Prepare backend command - backend_env = { - "PORT": str(backend_port), - "PYTHONPATH": str(backend_dir), - **dict(os.environ), - } - - # Check if we can use uv, otherwise fall back to system python - try: - subprocess.run(["uv", "--version"], check=True, capture_output=True) - backend_cmd = [ - "uv", - "run", - "uvicorn", - "main:app", - "--host", - "0.0.0.0", - "--port", - str(backend_port), - ] - logging.info("Using uv for backend") - except (subprocess.CalledProcessError, FileNotFoundError): - # Fall back to system python with uvicorn - logging.info("uv not available, using system python for backend") - backend_cmd = [ - "python3", - "-m", - "uvicorn", - "main:app", - "--host", - "0.0.0.0", - "--port", - str(backend_port), - ] - - # Prepare frontend command - frontend_env = { - "NEXT_PUBLIC_API_URL": f"http://localhost:{backend_port}", - "PORT": str(frontend_port), - **dict(os.environ), - } - - # Check if dependencies are installed - if frontend_dir.exists(): - node_modules = frontend_dir / "node_modules" - if not node_modules.exists(): - logging.info("Installing frontend dependencies...") - npm_install = subprocess.run( - ["npm", "install"], cwd=frontend_dir, capture_output=True - ) - if npm_install.returncode != 0: - logging.error( - f"Failed to install npm dependencies: {npm_install.stderr.decode()}" - ) - cleanup_processes() - raise RuntimeError("npm install failed") - - # Check if we have a production build - if (frontend_dir / ".next").exists(): - logging.info("Running in production mode (pre-built)...") - frontend_cmd = [ - 
"npm", - "run", - "start", - "--", - "--port", - str(frontend_port), - "--hostname", - "0.0.0.0", - ] - else: - logging.info("Running in development mode...") - frontend_cmd = [ - "npm", - "run", - "dev", - "--", - "--port", - str(frontend_port), - "--hostname", - "0.0.0.0", - ] - - # ๐Ÿš€ START BOTH PROCESSES IN PARALLEL - logging.info("Starting backend and frontend in parallel...") - - # Start backend - UPDATE GLOBAL VARIABLE - backend_process = subprocess.Popen( - backend_cmd, - cwd=backend_dir, - env=backend_env, - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, # Don't capture stdout - reserved for MCP - stderr=subprocess.DEVNULL, # Don't capture stderr - reserved for MCP - ) - - # Start frontend immediately (in parallel) - UPDATE GLOBAL VARIABLE - if frontend_dir.exists(): - frontend_process = subprocess.Popen( - frontend_cmd, - cwd=frontend_dir, - env=frontend_env, - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, # Don't capture stdout - reserved for MCP - stderr=subprocess.DEVNULL, # Don't capture stderr - reserved for MCP - ) - - # ๐Ÿš€ WAIT FOR BOTH IN PARALLEL WITH FAST POLLING - backend_ready = False - frontend_ready = False - - # Use faster polling (every 200ms instead of 1s) - max_attempts_backend = 150 # 30 seconds at 200ms intervals - max_attempts_frontend = 600 # 120 seconds at 200ms intervals - - for attempt in range(max(max_attempts_backend, max_attempts_frontend)): - # Check if processes are still alive - if backend_process and backend_process.poll() is not None: - logging.error(f"Backend process died with exit code {backend_process.returncode}") - cleanup_processes() - raise RuntimeError("Backend failed to start") - - if frontend_process and frontend_process.poll() is not None: - logging.error(f"Frontend process died with exit code {frontend_process.returncode}") - cleanup_processes() - raise RuntimeError("Frontend failed to start") - - # Check backend readiness - if not backend_ready and attempt < max_attempts_backend: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(0.1) - try: - result = sock.connect_ex(("localhost", backend_port)) - sock.close() - if result == 0: - backend_ready = True - logging.info(f"Backend is ready (attempt {attempt + 1})") - except: - pass - - # Check frontend readiness - if not frontend_ready and attempt < max_attempts_frontend: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(0.1) - try: - result = sock.connect_ex(("localhost", frontend_port)) - sock.close() - if result == 0: - frontend_ready = True - logging.info(f"Frontend is ready (attempt {attempt + 1})") - except: - pass - - # Exit early if both are ready - if backend_ready and frontend_ready: - break - - time.sleep(0.2) # 200ms intervals instead of 1s - - # Check final status - if not backend_ready: - logging.error("Backend did not start within 30 seconds") - cleanup_processes() - raise RuntimeError("Backend startup timeout") - - if not frontend_ready: - logging.error("Frontend did not start within 2 minutes") - cleanup_processes() - raise RuntimeError("Frontend startup timeout") - - # Log startup information - logging.info("2048 game started successfully!") - logging.info(f"Frontend: http://localhost:{frontend_port}") - logging.info(f"Backend API: http://localhost:{backend_port}/docs") - logging.info("Press Ctrl+C to stop") - - # Wait for processes to finish - while True: - time.sleep(1) - if backend_process and backend_process.poll() is not None: - logging.error("Backend process died unexpectedly") - break - if 
frontend_process and frontend_process.poll() is not None: - logging.error("Frontend process died unexpectedly") - break - - except Exception as e: - logging.error(f"Error launching app: {e}") - cleanup_processes() - raise - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Launch 2048 Game") - parser.add_argument("--frontend-port", type=int, default=3001, help="Frontend port") - parser.add_argument("--backend-port", type=int, default=5001, help="Backend port") - - args = parser.parse_args() - - try: - launch_app(args.frontend_port, args.backend_port) - except KeyboardInterrupt: - logging.info("App interrupted by user") - except Exception as e: - logging.error(f"Failed to launch app: {e}") - sys.exit(1) diff --git a/environments/browser/environment/README.md b/environments/browser/environment/README.md deleted file mode 100644 index 2c86019e..00000000 --- a/environments/browser/environment/README.md +++ /dev/null @@ -1,135 +0,0 @@ -# Apps Directory - -Launchable web applications for the HUD browser environment. Each app is a self-contained service that can be dynamically launched. - -## App Specification - -Each app must implement: - -### Required Files -- `launch.py` - Entry point script with standardized arguments -- `backend/` - Backend service (required) -- `frontend/` - Frontend service (optional) - -### Launch Script Interface - -```python -# launch.py -import argparse - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--frontend-port", type=int) - parser.add_argument("--backend-port", type=int, required=True) - args = parser.parse_args() - - # Start your services on the provided ports - # Backend must run on args.backend_port - # Frontend (if present) should run on args.frontend_port - -if __name__ == "__main__": - main() -``` - -### Service Requirements - -**Backend** -- Must bind to the provided `--backend-port` -- Should implement health check endpoint (`/health`) -- Must handle graceful shutdown -- Should use production-ready server (uvicorn, gunicorn, etc.) - -**Frontend** (Optional) -- Must bind to the provided `--frontend-port` -- Should be a static build or development server -- Common frameworks: Next.js, React, Vue, etc. - -## App Lifecycle - -1. **Discovery** - Apps are discovered by scanning subdirectories -2. **Launch** - Controller calls `python launch.py --backend-port=5000 --frontend-port=3000` -3. **Registration** - Ports are registered for API access -4. **Operation** - App services run independently -5. 
**Cleanup** - Processes terminated when environment shuts down - -## Integration Patterns - -### Basic Web App -```python -# Minimal FastAPI backend -from fastapi import FastAPI -import uvicorn - -app = FastAPI() - -@app.get("/health") -def health(): - return {"status": "healthy"} - -if __name__ == "__main__": - import sys - port = int(sys.argv[sys.argv.index("--backend-port") + 1]) - uvicorn.run(app, host="0.0.0.0", port=port) -``` - -### Full-Stack App -```python -# launch.py for app with both frontend and backend -import subprocess -import sys - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--frontend-port", type=int) - parser.add_argument("--backend-port", type=int, required=True) - args = parser.parse_args() - - # Start backend - backend_proc = subprocess.Popen([ - "uvicorn", "backend.main:app", - "--host", "0.0.0.0", - "--port", str(args.backend_port) - ]) - - # Start frontend (if port provided) - if args.frontend_port: - frontend_proc = subprocess.Popen([ - "npm", "run", "dev", "--", "--port", str(args.frontend_port) - ], cwd="frontend") - - # Wait for processes - try: - backend_proc.wait() - except KeyboardInterrupt: - backend_proc.terminate() - if args.frontend_port: - frontend_proc.terminate() -``` - -## Optional Integrations - -### Evaluation APIs -Apps can optionally provide evaluation endpoints for testing: -- `GET /api/eval/health` - Health check -- `GET /api/eval/stats` - Application statistics -- Additional endpoints as needed - -### Environment Access -Apps can access the browser environment through: -- Shared network (communicate with controller) -- File system (shared volumes) -- Environment variables - -## Development Guidelines - -- **Port Binding** - Always use provided ports, never hardcode -- **Health Checks** - Implement basic health endpoints -- **Logging** - Use structured logging for debugging -- **Dependencies** - Manage dependencies with lockfiles -- **Graceful Shutdown** - Handle SIGTERM properly -- **Error Handling** - Return meaningful error responses - -## Examples - -- `todo/` - Full-stack Next.js + FastAPI application with evaluation integration -- See individual app READMEs for specific implementation details \ No newline at end of file diff --git a/environments/browser/environment/__init__.py b/environments/browser/environment/__init__.py deleted file mode 100644 index 36902690..00000000 --- a/environments/browser/environment/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Browser environment server package.""" - -__version__ = "0.1.0" diff --git a/environments/browser/environment/pyproject.toml b/environments/browser/environment/pyproject.toml deleted file mode 100644 index f6f853f8..00000000 --- a/environments/browser/environment/pyproject.toml +++ /dev/null @@ -1,23 +0,0 @@ -[project] -name = "hud-browser-environment" -version = "0.1.0" -description = "HUD Browser Environment Backend" -requires-python = ">=3.11,<3.14" -dependencies = [ - "fastapi>=0.104.1", - "uvicorn[standard]>=0.24.0", - "python-multipart>=0.0.6", - "pydantic>=2.6,<3", - "pydantic-settings>=2.2,<3", - "httpx", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.metadata] -allow-direct-references = true - -[tool.hatch.build.targets.wheel] -packages = ["environment"] diff --git a/environments/browser/environment/server.py b/environments/browser/environment/server.py deleted file mode 100644 index bd1297c7..00000000 --- a/environments/browser/environment/server.py +++ /dev/null @@ -1,503 +0,0 @@ -""" -FastAPI 
server for browser environment. -Exposes API endpoints to interact with the environment and its subcomponents. -""" - -import asyncio -import subprocess -import os -import logging -from pathlib import Path -from typing import Optional, Dict, List, Any, Set -import socket -from contextlib import asynccontextmanager -import shutil -import httpx - -from fastapi import FastAPI, HTTPException -from pydantic import BaseModel - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s" -) -logger = logging.getLogger(__name__) - - -class AppInfo(BaseModel): - """Information about a launched app.""" - - name: str - frontend_port: int - backend_port: int - url: str - status: str - - -class ServiceStatus(BaseModel): - """Status of environment services.""" - - x11: bool - vnc: bool - websockify: bool - apps: List[AppInfo] - - -class LaunchAppRequest(BaseModel): - """Request to launch an app.""" - - app_name: str - - -class LaunchAppResponse(BaseModel): - """Response after launching an app.""" - - name: str - url: str - frontend_port: int - backend_port: int - - -class ServiceManager: - """Manages environment services (X11, VNC, apps).""" - - def __init__(self): - self.x11_proc: Optional[subprocess.Popen] = None - self.vnc_proc: Optional[subprocess.Popen] = None - self.websockify_proc: Optional[subprocess.Popen] = None - self.chrome_proc: Optional[subprocess.Popen] = None - self.cdp_port: Optional[int] = None - self._launched_apps: Dict[str, AppInfo] = {} - self._playwright = None - self._browser = None - self._app_processes: Dict[str, subprocess.Popen] = {} - self._allocated_ports: Set[int] = set() - - async def start_core_services(self): - """Start X11, VNC, and websockify services.""" - # Check if X11 is already running - if Path("/tmp/.X11-unix/X1").exists(): - logger.info("X11 display :1 already running") - else: - # Start Xvfb if not already running - self.x11_proc = subprocess.Popen( - ["Xvfb", ":1", "-screen", "0", "1920x1080x24"], - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - ) - logger.info("Started Xvfb on display :1") - - # Wait for X11 - await self._wait_for_x11() - - # Start VNC and websockify - await self._start_vnc_services() - - async def _wait_for_x11(self): - """Wait for X11 display to be ready.""" - for i in range(100): # 10 seconds max - if Path("/tmp/.X11-unix/X1").exists(): - logger.info("X11 display :1 is ready") - os.environ["DISPLAY"] = ":1" - return - await asyncio.sleep(0.1) - raise TimeoutError("X11 failed to start") - - async def _start_vnc_services(self): - """Start VNC and websockify services.""" - # Start x11vnc - self.vnc_proc = subprocess.Popen( - ["x11vnc", "-display", ":1", "-forever", "-shared", "-nopw"], - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - env={**os.environ, "DISPLAY": ":1"}, - ) - logger.info("Started x11vnc") - - # Start websockify - self.websockify_proc = subprocess.Popen( - ["websockify", "--web", "/usr/share/novnc", "8080", "localhost:5900"], - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - ) - logger.info("Started websockify on port 8080") - - # Wait for both services - await asyncio.gather( - self._wait_for_port(5900, "VNC"), self._wait_for_port(8080, "websockify") - ) - logger.info("noVNC available at: http://localhost:8080/vnc.html") - - # Start Playwright's Chromium browser - logger.info("Starting Playwright's Chromium browser") - try: - from playwright.async_api 
import async_playwright - - self._playwright = await async_playwright().start() - # Get a free port for CDP - self.cdp_port = self._get_next_port() - - self._browser = await self._playwright.chromium.launch( - headless=False, - args=[ - f"--remote-debugging-port={self.cdp_port}", - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-gpu", - "--disable-web-security", - "--disable-features=IsolateOrigins,site-per-process", - "--display=:1", - "--start-maximized", - ], - env={**os.environ, "DISPLAY": ":1"}, - ) - - logger.info(f"Started Playwright Chromium with CDP on port {self.cdp_port}") - - # Wait for CDP to be ready - await self._wait_for_port(self.cdp_port, "CDP", timeout=30) - - # Open a default page so the browser window is visible - default_context = await self._browser.new_context( - viewport={"width": 1920, "height": 1080}, no_viewport=False - ) - default_page = await default_context.new_page() - await default_page.goto("about:blank") - logger.info("Opened default browser page") - - except ImportError: - logger.error("Playwright not installed") - raise RuntimeError("Playwright is required. The Docker image should have installed it.") - except Exception as e: - logger.error(f"Failed to start Playwright browser: {e}") - raise - - async def launch_app(self, app_name: str) -> LaunchAppResponse: - """Launch a specific app dynamically.""" - # Check if app is already running - if app_name in self._launched_apps: - app_info = self._launched_apps[app_name] - if app_info.status == "running": - return LaunchAppResponse( - name=app_info.name, - url=app_info.url, - frontend_port=app_info.frontend_port, - backend_port=app_info.backend_port, - ) - - app_path = Path(f"/app/environment/{app_name}") - if not app_path.exists(): - raise ValueError(f"App '{app_name}' not found at {app_path}") - - # Check if app has a launch script - launch_script = app_path / "launch.py" - if not launch_script.exists(): - raise ValueError(f"App '{app_name}' missing launch.py") - - # Get unique ports for frontend and backend - frontend_port = self._get_next_port() - backend_port = self._get_next_port() - - # Launch the app - proc = subprocess.Popen( - [ - "python3", - str(launch_script), - "--frontend-port", - str(frontend_port), - "--backend-port", - str(backend_port), - ], - cwd=app_path, - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - env={**os.environ, "DISPLAY": ":1"}, - ) - - self._app_processes[app_name] = proc - - try: - # Wait for both ports - await asyncio.gather( - self._wait_for_port(frontend_port, f"app '{app_name}' frontend", timeout=60), - self._wait_for_port(backend_port, f"app '{app_name}' backend", timeout=60), - ) - - logger.info( - f"Launched app '{app_name}' - Frontend: {frontend_port}, Backend: {backend_port}" - ) - - # Store app information - app_info = AppInfo( - name=app_name, - frontend_port=frontend_port, - backend_port=backend_port, - url=f"http://localhost:{frontend_port}", - status="running", - ) - self._launched_apps[app_name] = app_info - - return LaunchAppResponse( - name=app_name, - url=app_info.url, - frontend_port=frontend_port, - backend_port=backend_port, - ) - - except TimeoutError: - # Check if process is still running - if proc.poll() is not None: - logger.error(f"App '{app_name}' process exited with code {proc.returncode}") - else: - logger.error(f"App '{app_name}' failed to become ready within timeout") - raise - - def get_service_status(self) -> ServiceStatus: - """Get status of all services.""" - # Update app statuses - for 
app_name, proc in self._app_processes.items(): - if app_name in self._launched_apps: - if proc.poll() is None: - self._launched_apps[app_name].status = "running" - else: - self._launched_apps[app_name].status = "stopped" - - return ServiceStatus( - x11=self.x11_proc is not None and self.x11_proc.poll() is None - if self.x11_proc - else Path("/tmp/.X11-unix/X1").exists(), - vnc=self.vnc_proc is not None and self.vnc_proc.poll() is None - if self.vnc_proc - else self._is_port_open(5900), - websockify=self.websockify_proc is not None and self.websockify_proc.poll() is None - if self.websockify_proc - else self._is_port_open(8080), - apps=list(self._launched_apps.values()), - ) - - def get_app_info(self, app_name: str) -> AppInfo: - """Get information about a specific app.""" - if app_name not in self._launched_apps: - raise ValueError(f"App '{app_name}' not found") - return self._launched_apps[app_name] - - async def shutdown(self): - """Shutdown all services gracefully.""" - # Stop app processes - for name, proc in self._app_processes.items(): - if proc.poll() is None: - proc.terminate() - await asyncio.sleep(1) - if proc.poll() is None: - proc.kill() - logger.info(f"Terminated app '{name}'") - - # Clear app tracking - self._app_processes.clear() - self._launched_apps.clear() - self._allocated_ports.clear() - - # Close Playwright browser - if self._browser: - try: - await self._browser.close() - logger.info("Closed Playwright browser") - except Exception as e: - logger.error(f"Error closing browser: {e}") - - if self._playwright: - try: - await self._playwright.stop() - logger.info("Stopped Playwright") - except Exception as e: - logger.error(f"Error stopping playwright: {e}") - - # Stop services in reverse order - for proc, name in [ - (self.websockify_proc, "websockify"), - (self.vnc_proc, "x11vnc"), - (self.x11_proc, "Xvfb"), - ]: - if proc and proc.poll() is None: - proc.terminate() - await asyncio.sleep(0.5) - if proc.poll() is None: - proc.kill() - logger.info(f"Stopped {name}") - - def _is_port_open(self, port: int) -> bool: - """Check if a port is open.""" - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(0.1) - try: - result = sock.connect_ex(("localhost", port)) - sock.close() - return result == 0 - except: - return False - - def _get_next_port(self) -> int: - """Get next available port for apps.""" - base_port = 3000 - for offset in range(200): # Support up to 200 ports - port = base_port + offset - if not self._is_port_open(port) and port not in self._allocated_ports: - self._allocated_ports.add(port) - return port - raise RuntimeError("No available ports") - - async def _wait_for_port(self, port: int, service_name: str = "service", timeout: int = 30): - """Wait for a port to become available.""" - for _ in range(timeout * 5): # Check every 200ms - if self._is_port_open(port): - logger.info(f"{service_name} is ready on port {port}") - return - await asyncio.sleep(0.2) - raise TimeoutError(f"Port {port} did not become available for {service_name}") - - async def get_cdp_websocket_url(self) -> str | None: - """Discover the actual CDP WebSocket URL from Chrome's /json/version endpoint.""" - if not self.cdp_port: - return None - - try: - async with httpx.AsyncClient() as client: - response = await client.get( - f"http://localhost:{self.cdp_port}/json/version", timeout=5.0 - ) - if response.status_code == 200: - data = response.json() - # Chrome returns webSocketDebuggerUrl in /json/version response - websocket_url = data.get("webSocketDebuggerUrl") - if 
websocket_url: - return websocket_url - - # Fallback: try /json/list to find a browser target - response = await client.get( - f"http://localhost:{self.cdp_port}/json/list", timeout=5.0 - ) - if response.status_code == 200: - targets = response.json() - # Look for a browser target (type 'page' or title containing 'about:blank') - for target in targets: - if target.get("type") == "page" or "about:blank" in target.get("url", ""): - websocket_url = target.get("webSocketDebuggerUrl") - if websocket_url: - return websocket_url - - except Exception as e: - logger.warning(f"Failed to discover CDP WebSocket URL: {e}") - - # Final fallback to generic path (may not work) - return f"ws://localhost:{self.cdp_port}/devtools/browser" - - -# Global service manager instance -service_manager = ServiceManager() - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """Manage application lifecycle.""" - # Startup - logger.info("Starting browser environment server...") - await service_manager.start_core_services() - logger.info("Browser environment server ready") - - yield - - # Shutdown - logger.info("Shutting down browser environment server...") - await service_manager.shutdown() - - -# Create FastAPI app -app = FastAPI( - title="Browser Environment API", - description="API for managing browser environment services and applications", - version="1.0.0", - lifespan=lifespan, -) - - -@app.get("/health") -async def health_check(): - """Health check endpoint.""" - return {"status": "healthy"} - - -@app.get("/status", response_model=ServiceStatus) -async def get_status(): - """Get status of all environment services.""" - return service_manager.get_service_status() - - -@app.post("/apps/launch", response_model=LaunchAppResponse) -async def launch_app(request: LaunchAppRequest): - """Launch a specific application.""" - try: - return await service_manager.launch_app(request.app_name) - except ValueError as e: - raise HTTPException(status_code=404, detail=str(e)) - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@app.get("/apps/{app_name}", response_model=AppInfo) -async def get_app_info(app_name: str): - """Get information about a specific app.""" - try: - return service_manager.get_app_info(app_name) - except ValueError as e: - raise HTTPException(status_code=404, detail=str(e)) - - -@app.get("/vnc/url") -async def get_vnc_url(): - """Get the VNC viewer URL.""" - return {"url": "http://localhost:8080/vnc.html"} - - -@app.get("/display") -async def get_display(): - """Get the X11 display information.""" - return { - "display": os.environ.get("DISPLAY", ":1"), - "x11_running": Path("/tmp/.X11-unix/X1").exists(), - } - - -@app.get("/cdp") -async def get_cdp(): - """Return the CDP websocket URL for connecting Playwright/Chromium clients.""" - if service_manager.cdp_port is None: - raise HTTPException(status_code=503, detail="CDP not available") - - # Discover the actual CDP WebSocket URL from Chrome - websocket_url = await service_manager.get_cdp_websocket_url() - if not websocket_url: - raise HTTPException(status_code=503, detail="CDP WebSocket URL not available") - - return {"ws": websocket_url} - - -@app.post("/shutdown") -async def shutdown_env(): - """Gracefully stop services and request server shutdown.""" - try: - await service_manager.shutdown() - except Exception as e: - logger.warning(f"Error during environment shutdown: {e}") - # Signal uvicorn to exit via lifespan shutdown - # FastAPI/uvicorn doesn't expose server here; we rely on process signal from caller. 
- return {"status": "shutting_down"} - - -if __name__ == "__main__": - import uvicorn - - uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/environments/browser/environment/todo/README.md b/environments/browser/environment/todo/README.md deleted file mode 100644 index 7d2460e9..00000000 --- a/environments/browser/environment/todo/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# Todo App - -Simple todo list application with Next.js frontend and FastAPI backend, fully integrated with the HUD evaluation system. - -## Tech Stack - -- **Frontend**: Next.js, TypeScript, Tailwind CSS -- **Backend**: FastAPI, SQLite, uv for dependency management -- **Evaluation**: Comprehensive API endpoints for testing - -## Development - -```bash -# Backend -cd backend && uv run uvicorn main:app --reload - -# Frontend -cd frontend && npm install && npm run dev -``` - -## Launching - -```python -await client.call_tool("launch_app", {"app_name": "todo"}) -``` - -## Evaluation Integration - -### Backend API Endpoints -- `GET /api/eval/health` - Health check -- `GET /api/eval/stats` - Comprehensive statistics -- `GET /api/eval/has_todo?text=` - Check if todo exists -- `GET /api/eval/completion_rate` - Completion percentage -- `POST /api/eval/seed` - Seed test data -- `DELETE /api/eval/reset` - Reset database - -### Controller Components -- **Evaluators**: `TodoCompletedEvaluator`, `TodoExistsEvaluator`, `CompositeEvaluator` -- **Setup Tools**: `TodoSeedSetup`, `TodoResetSetup`, `TodoCustomSeedSetup` -- **Problems**: `TodoBasicUsageProblem`, `TodoCompositeWeightedProblem` - -### Usage Examples - -```python -# Complete problem execution -await setup({"name": "todo_basic_usage"}) -await evaluate({"name": "todo_basic_usage"}) - -# Direct function calls -await setup({"name": "todo_reset", "arguments": {}}) -await evaluate({"name": "todo_completion_rate", "arguments": {"min_rate": 0.5}}) - -# MCP resource discovery -todo_evaluators = await client.read_resource("evaluators://todo") -``` - -## Database Schema - -```sql -CREATE TABLE items ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - title TEXT NOT NULL, - description TEXT, - completed BOOLEAN DEFAULT FALSE, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -); -``` - -## Testing - -### Manual -1. Launch app: `await launch_app("todo")` -2. Access at http://localhost:3000 -3. 
Run evaluations - -### Automated -```bash -# Test APIs -curl http://localhost:5000/api/eval/health -curl http://localhost:5000/api/eval/stats - -# Test MCP tools -await setup({"name": "todo_basic_usage"}) -await evaluate({"name": "todo_basic_usage"}) -``` \ No newline at end of file diff --git a/environments/browser/environment/todo/backend/main.py b/environments/browser/environment/todo/backend/main.py deleted file mode 100644 index 5839fa85..00000000 --- a/environments/browser/environment/todo/backend/main.py +++ /dev/null @@ -1,391 +0,0 @@ -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel -from typing import List, Optional -from datetime import datetime -import sqlite3 -import json - -app = FastAPI(title="Todo API with Evaluation", version="0.2.0") - -# Configure CORS -app.add_middleware( - CORSMiddleware, - allow_origins=["http://localhost:3000"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -# Pydantic models -class Item(BaseModel): - id: Optional[int] = None - title: str - description: str - completed: bool = False - created_at: Optional[datetime] = None - - -class ItemCreate(BaseModel): - title: str - description: str - completed: bool = False - - -class BulkUpdateRequest(BaseModel): - item_ids: List[int] - completed: Optional[bool] = None - - -class EvaluationStats(BaseModel): - total_items: int - completed_items: int - pending_items: int - completion_rate: float - items: List[Item] - timestamps: dict - - -# Database setup -def init_db(): - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute(""" - CREATE TABLE IF NOT EXISTS items ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - title TEXT NOT NULL, - description TEXT, - completed BOOLEAN NOT NULL DEFAULT 0, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """) - conn.commit() - conn.close() - - -init_db() - - -# === CORE TODO API ROUTES === - - -@app.get("/api/status") -def status(): - return {"status": "ok", "timestamp": datetime.now().isoformat()} - - -@app.get("/api/items", response_model=List[Item]) -def get_items(): - conn = sqlite3.connect("app.db") - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute("SELECT * FROM items ORDER BY created_at DESC") - items = [dict(row) for row in c.fetchall()] - conn.close() - return items - - -@app.post("/api/items", response_model=Item) -def create_item(item: ItemCreate): - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute( - "INSERT INTO items (title, description, completed) VALUES (?, ?, ?)", - (item.title, item.description, item.completed), - ) - item_id = c.lastrowid - conn.commit() - conn.close() - - return get_item(item_id) - - -@app.get("/api/items/{item_id}", response_model=Item) -def get_item(item_id: int): - conn = sqlite3.connect("app.db") - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute("SELECT * FROM items WHERE id = ?", (item_id,)) - item = c.fetchone() - conn.close() - - if not item: - raise HTTPException(status_code=404, detail="Item not found") - - return dict(item) - - -@app.put("/api/items/{item_id}", response_model=Item) -def update_item(item_id: int, item: ItemCreate): - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute( - "UPDATE items SET title = ?, description = ?, completed = ? 
WHERE id = ?", - (item.title, item.description, item.completed, item_id), - ) - conn.commit() - - if c.rowcount == 0: - conn.close() - raise HTTPException(status_code=404, detail="Item not found") - - conn.close() - return get_item(item_id) - - -@app.delete("/api/items/{item_id}") -def delete_item(item_id: int): - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute("DELETE FROM items WHERE id = ?", (item_id,)) - conn.commit() - - if c.rowcount == 0: - conn.close() - raise HTTPException(status_code=404, detail="Item not found") - - conn.close() - return {"message": "Item deleted successfully"} - - -# === EVALUATION API ROUTES === - - -@app.get("/api/eval/health") -def eval_health(): - """Health check endpoint for evaluation system.""" - try: - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute("SELECT COUNT(*) FROM items") - count = c.fetchone()[0] - conn.close() - - return { - "status": "healthy", - "database_accessible": True, - "total_items": count, - "timestamp": datetime.now().isoformat(), - } - except Exception as e: - return {"status": "unhealthy", "error": str(e), "timestamp": datetime.now().isoformat()} - - -@app.get("/api/eval/stats", response_model=EvaluationStats) -def get_evaluation_stats(): - """Comprehensive evaluation statistics for the todo app.""" - conn = sqlite3.connect("app.db") - conn.row_factory = sqlite3.Row - c = conn.cursor() - - # Get total counts - c.execute("SELECT COUNT(*) as total FROM items") - total = c.fetchone()[0] - - c.execute("SELECT COUNT(*) as completed FROM items WHERE completed = 1") - completed = c.fetchone()[0] - - # Get all items with details - c.execute("SELECT * FROM items ORDER BY created_at DESC") - items = [dict(row) for row in c.fetchall()] - - # Get timing information - c.execute(""" - SELECT created_at - FROM items - ORDER BY created_at DESC - LIMIT 1 - """) - last_created_row = c.fetchone() - last_created = last_created_row[0] if last_created_row else None - - c.execute(""" - SELECT created_at - FROM items - WHERE completed = 1 - ORDER BY created_at DESC - LIMIT 1 - """) - last_completed_row = c.fetchone() - last_completed = last_completed_row[0] if last_completed_row else None - - conn.close() - - return EvaluationStats( - total_items=total, - completed_items=completed, - pending_items=total - completed, - completion_rate=completed / total if total > 0 else 0.0, - items=items, - timestamps={"last_created": last_created, "last_completed": last_completed}, - ) - - -@app.get("/api/eval/todos", response_model=List[Item]) -def get_todos_for_evaluation(): - """Get all todos for evaluation purposes (alias for /api/items).""" - return get_items() - - -@app.get("/api/eval/has_todo") -def check_todo_exists(text: str): - """Check if a todo item exists with specific text in title or description.""" - conn = sqlite3.connect("app.db") - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute( - """ - SELECT * FROM items - WHERE title LIKE ? OR description LIKE ? 
- ORDER BY created_at DESC - """, - (f"%{text}%", f"%{text}%"), - ) - - items = [dict(row) for row in c.fetchall()] - conn.close() - - return { - "exists": len(items) > 0, - "count": len(items), - "search_text": text, - "matches": items, - "timestamp": datetime.now().isoformat(), - } - - -@app.post("/api/eval/bulk_update") -def bulk_update_items(request: BulkUpdateRequest): - """Update multiple items at once for evaluation purposes.""" - conn = sqlite3.connect("app.db") - c = conn.cursor() - - updated_count = 0 - if request.completed is not None: - for item_id in request.item_ids: - c.execute("UPDATE items SET completed = ? WHERE id = ?", (request.completed, item_id)) - if c.rowcount > 0: - updated_count += 1 - - conn.commit() - conn.close() - - return { - "message": f"Updated {updated_count} items", - "updated_count": updated_count, - "requested_ids": request.item_ids, - "timestamp": datetime.now().isoformat(), - } - - -@app.get("/api/eval/completion_rate") -def get_completion_rate(): - """Get the current completion rate as a percentage.""" - conn = sqlite3.connect("app.db") - c = conn.cursor() - - c.execute("SELECT COUNT(*) as total FROM items") - total = c.fetchone()[0] - - c.execute("SELECT COUNT(*) as completed FROM items WHERE completed = 1") - completed = c.fetchone()[0] - - conn.close() - - rate = completed / total if total > 0 else 0.0 - - return { - "completion_rate": rate, - "completion_percentage": rate * 100, - "completed_items": completed, - "total_items": total, - "timestamp": datetime.now().isoformat(), - } - - -# === EVALUATION UTILITY ROUTES === - - -@app.post("/api/eval/seed") -def seed_test_data(): - """Seed the database with test data for evaluation purposes.""" - test_items = [ - {"title": "Buy groceries", "description": "Get milk, eggs, and bread", "completed": True}, - { - "title": "Walk the dog", - "description": "Take Max for a 30-minute walk", - "completed": True, - }, - { - "title": "Finish project", - "description": "Complete the Q4 presentation", - "completed": False, - }, - {"title": "Call mom", "description": "Weekly check-in call", "completed": False}, - { - "title": "Schedule dentist", - "description": "Book appointment for cleaning", - "completed": False, - }, - ] - - conn = sqlite3.connect("app.db") - c = conn.cursor() - - for item in test_items: - c.execute( - """ - INSERT INTO items (title, description, completed) - VALUES (?, ?, ?) - """, - (item["title"], item["description"], item["completed"]), - ) - - conn.commit() - conn.close() - - return { - "message": "Test data seeded successfully", - "items_added": len(test_items), - "timestamp": datetime.now().isoformat(), - } - - -@app.post("/api/eval/seed_custom") -def seed_custom_data(items: List[ItemCreate]): - """Seed the database with custom test data for evaluation purposes.""" - conn = sqlite3.connect("app.db") - c = conn.cursor() - - items_added = 0 - for item in items: - c.execute( - """ - INSERT INTO items (title, description, completed) - VALUES (?, ?, ?) 
- """, - (item.title, item.description if hasattr(item, "description") else "", item.completed), - ) - items_added += 1 - - conn.commit() - conn.close() - - return { - "message": "Custom test data seeded successfully", - "items_added": items_added, - "timestamp": datetime.now().isoformat(), - } - - -@app.delete("/api/eval/reset") -def reset_database(): - """Reset the database to empty state for clean evaluation.""" - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute("DELETE FROM items") - conn.commit() - conn.close() - - return {"message": "Database reset successfully", "timestamp": datetime.now().isoformat()} diff --git a/environments/browser/environment/todo/backend/pyproject.toml b/environments/browser/environment/todo/backend/pyproject.toml deleted file mode 100644 index 493627d5..00000000 --- a/environments/browser/environment/todo/backend/pyproject.toml +++ /dev/null @@ -1,15 +0,0 @@ -[project] -name = "sample-backend" -version = "0.1.0" -description = "FastAPI backend for sample app" -requires-python = ">=3.10" -dependencies = [ - "fastapi==0.109.0", - "uvicorn[standard]==0.27.0", - "sqlalchemy==2.0.25", - "pydantic==2.5.3", - "python-multipart==0.0.6", -] - -[tool.uv] -dev-dependencies = [] \ No newline at end of file diff --git a/environments/browser/environment/todo/frontend/app/globals.css b/environments/browser/environment/todo/frontend/app/globals.css deleted file mode 100644 index de4d11a2..00000000 --- a/environments/browser/environment/todo/frontend/app/globals.css +++ /dev/null @@ -1,3 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; \ No newline at end of file diff --git a/environments/browser/environment/todo/frontend/app/layout.tsx b/environments/browser/environment/todo/frontend/app/layout.tsx deleted file mode 100644 index 0acab9a4..00000000 --- a/environments/browser/environment/todo/frontend/app/layout.tsx +++ /dev/null @@ -1,22 +0,0 @@ -import type { Metadata } from 'next' -import { Inter } from 'next/font/google' -import './globals.css' - -const inter = Inter({ subsets: ['latin'] }) - -export const metadata: Metadata = { - title: 'Sample App', - description: 'A sample Next.js app with FastAPI backend', -} - -export default function RootLayout({ - children, -}: { - children: React.ReactNode -}) { - return ( - - {children} - - ) -} \ No newline at end of file diff --git a/environments/browser/environment/todo/frontend/app/page.tsx b/environments/browser/environment/todo/frontend/app/page.tsx deleted file mode 100644 index c5de6422..00000000 --- a/environments/browser/environment/todo/frontend/app/page.tsx +++ /dev/null @@ -1,289 +0,0 @@ -'use client' - -import { useState, useEffect } from 'react' - -interface Item { - id: number - title: string - description: string - completed: boolean - created_at: string -} - -type FilterType = 'all' | 'active' | 'completed' - -// Dynamically determine API URL based on current port -// Backend is always on frontend_port + 1 -const getApiUrl = () => { - if (typeof window !== 'undefined') { - const currentPort = parseInt(window.location.port) || 3000; - return `http://localhost:${currentPort + 1}`; - } - return process.env.NEXT_PUBLIC_API_URL || 'http://localhost:5000'; -}; - -const API_URL = getApiUrl(); - -export default function Home() { - const [items, setItems] = useState([]) - const [newTitle, setNewTitle] = useState('') - const [newDescription, setNewDescription] = useState('') - const [loading, setLoading] = useState(true) - const [filter, setFilter] = useState('all') - const [searchTerm, 
setSearchTerm] = useState('') - - useEffect(() => { - fetchItems() - }, []) - - const fetchItems = async () => { - try { - const response = await fetch(`${API_URL}/api/items`) - const data = await response.json() - setItems(data) - } catch (error) { - console.error('Error fetching items:', error) - } finally { - setLoading(false) - } - } - - const createItem = async (e: React.FormEvent) => { - e.preventDefault() - if (!newTitle.trim()) return - - try { - const response = await fetch(`${API_URL}/api/items`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - title: newTitle, - description: newDescription, - completed: false - }) - }) - - if (response.ok) { - setNewTitle('') - setNewDescription('') - fetchItems() - } - } catch (error) { - console.error('Error creating item:', error) - } - } - - const toggleItem = async (id: number, item: Item) => { - try { - const response = await fetch(`${API_URL}/api/items/${id}`, { - method: 'PUT', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - ...item, - completed: !item.completed - }) - }) - - if (response.ok) { - fetchItems() - } - } catch (error) { - console.error('Error updating item:', error) - } - } - - const deleteItem = async (id: number) => { - try { - const response = await fetch(`${API_URL}/api/items/${id}`, { - method: 'DELETE' - }) - - if (response.ok) { - fetchItems() - } - } catch (error) { - console.error('Error deleting item:', error) - } - } - - const markAllComplete = async () => { - const activeItems = items.filter(item => !item.completed) - for (const item of activeItems) { - await toggleItem(item.id, item) - } - } - - const deleteCompleted = async () => { - const completedItems = items.filter(item => item.completed) - for (const item of completedItems) { - await deleteItem(item.id) - } - } - - // Filter and search logic - const filteredItems = items - .filter(item => { - if (filter === 'active') return !item.completed - if (filter === 'completed') return item.completed - return true - }) - .filter(item => { - if (!searchTerm) return true - const term = searchTerm.toLowerCase() - return item.title.toLowerCase().includes(term) || - item.description.toLowerCase().includes(term) - }) - - const stats = { - total: items.length, - active: items.filter(i => !i.completed).length, - completed: items.filter(i => i.completed).length - } - - return ( -
-      {/* Page layout (element markup unrecoverable): "Todo App" heading; a stats bar showing
-          Total ({stats.total}), Active ({stats.active}), and Completed ({stats.completed}) counts;
-          filter and search controls; and an "Add New Item" form whose title text input is bound to
-          newTitle via setNewTitle. */}
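For reference, here is a minimal sketch of how that stripped markup might have looked, reconstructed from the state and handlers that survive earlier in the deleted `page.tsx` (`stats`, `newTitle`, `setNewTitle`, `createItem`). The component name, element structure, and all Tailwind classes except the one preserved on the title input are assumptions, not the original file's contents:

```tsx
import type { FormEvent } from 'react'

// Illustrative sketch only, not the original markup. It reuses the state and
// handlers defined earlier in page.tsx (stats, newTitle, setNewTitle, createItem).
export function TodoPageSketch(props: {
  stats: { total: number; active: number; completed: number }
  newTitle: string
  setNewTitle: (value: string) => void
  createItem: (e: FormEvent) => void
}) {
  const { stats, newTitle, setNewTitle, createItem } = props
  return (
    <main className="max-w-2xl mx-auto p-8">
      <h1 className="text-3xl font-bold mb-4">Todo App</h1>

      {/* Stats Bar */}
      <div className="flex gap-4 mb-6 text-sm">
        <span>Total: {stats.total}</span>
        <span>Active: {stats.active}</span>
        <span>Completed: {stats.completed}</span>
      </div>

      {/* Add Item Form */}
      <form onSubmit={createItem}>
        <h2 className="text-lg font-semibold mb-2">Add New Item</h2>
        <input
          type="text"
          value={newTitle}
          onChange={(e) => setNewTitle(e.target.value)}
          placeholder="Title"
          className="w-full px-4 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500"
        />
      </form>
    </main>
  )
}
```

The original page presumably continued in the same style with the filter buttons, the search input, and a list rendered from `filteredItems`.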