diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 61d713d0..b070add5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -59,4 +59,4 @@ jobs: uses: astral-sh/setup-uv@v5 - name: Run pyright - run: uv run --with=".[rl,dev]" pyright + run: uv run --with=".[dev]" pyright diff --git a/README.md b/README.md index bd4fdfd3..ff67506a 100644 --- a/README.md +++ b/README.md @@ -6,395 +6,133 @@ -OSS RL environment + evals toolkit. Wrap software as environments, run benchmarks, and train with RL โ€“ locally or at scale. +The HUD SDK is an open-source Python toolkit for building, evaluating, and training AI agents. Use a unified API for any model provider, wrap your code as MCP environments, run A/B evals at scale, and train with reinforcement learning. -[![PyPI version](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/) +To learn more, check out our [Documentation](https://docs.hud.ai) and [API Reference](https://docs.hud.ai/reference). + +[![PyPI](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/) [![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE) [![Add docs to Cursor](https://img.shields.io/badge/Add%20docs%20to-Cursor-black?style=flat-square)](https://cursor.com/en/install-mcp?name=docs-hud-python&config=eyJ1cmwiOiJodHRwczovL2RvY3MuaHVkLmFpL21jcCJ9) [![Discord](https://img.shields.io/discord/1327447144772407390?label=Discord&logo=discord&style=flat-square)](https://discord.gg/wkjtmHYYjm) [![X Follow](https://img.shields.io/twitter/follow/hud_evals?style=social)](https://x.com/intent/user?screen_name=hud_evals) [![Shop](https://img.shields.io/badge/_-white.svg?label=shop&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAJCAYAAAAywQxIAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAACxMAAAsTAQCanBgAAAF6SURBVChTlZA9ixNhFIWf8yaTpFHRRMXCKpAZhCAYFvwoLHZhwUKw9A9YCJb+Bq0sxGbBQrTxX1j41dvIRAjGZbdwRUUGIzPMeyw2swS3WZ/ynHvP5VylafoAWAd+5Xm+wX+SpukmcMf29RDCZrD9BViz3f53+CjYngKZpD5A2/Y7SQBMJpOkKIprdV1vdzqdHzHGblmW9Ww2+5pl2TmAxWKxmM/nP8fj8cmqqtZijJ9sb0u6ABBWjh0riuIt8CqE8LGu66e2d5MkeQ8QY3xme7fb7T4ZjUbrZVl+jjFuSXoEXGxCDgIl9WzfAO5LSmzvNB771R6vzG4Bx0MIt/M8vwV8aLyDQNt70+n0G1AspaTxVln+aghQluVsKbvxVysflT9NQK/XO7R/SGiQ9Nt2aftElmWXJd1kv0kbeANQVdWl4XB4XtJouXaqNRgMHkrqS+r0+/3XwD1JXdungRfAVWBi+6WkK8D3EMJz22cl3W21WgNgx3YAzvwFd0Chdq03gKUAAAAASUVORK5CYII=&style=social)](https://shop.hud.ai) [![Scarf](https://static.scarf.sh/a.png?x-pxid=6530ff33-4945-452b-81f9-626872593933)](https://scarf.sh) +[![Docs](https://img.shields.io/badge/docs-hud.ai-blue?style=flat-square)](https://docs.hud.ai) - -### Are you an enterprise building agents? - -[๐Ÿ“… Hop on a call](https://cal.com/jay-hud) or [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) - -## Highlights - -- ๐Ÿš€ **[MCP environment skeleton](https://docs.hud.ai/core-concepts/mcp-protocol)** โ€“ any agent can call any environment. -- โšก๏ธ **[Live telemetry](https://hud.ai)** โ€“ inspect every tool call, observation, and reward in real time. -- ๐Ÿ—‚๏ธ **[Public benchmarks](https://hud.ai/leaderboards)** โ€“ OSWorld-Verified, SheetBench-50, and more. -- ๐ŸŒ **[Cloud browsers](environments/remote_browser/)** โ€“ AnchorBrowser, Steel, BrowserBase integrations for browser automation. -- ๐Ÿ› ๏ธ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** โ€“ `hud dev` for iterating on environments without rebuilds. 
-- ๐ŸŽ“ **[One-click RL](https://hud.ai/models)** โ€“ Run `hud rl` to get a trained model on any environment. - -> We welcome contributors and feature requests โ€“ open an issue or hop on a call to discuss improvements! - -## Installation +## Install ```bash -# SDK - MCP servers, telemetry, evaluation pip install hud-python - -# CLI - RL pipeline, environment design -uv tool install hud-python@latest --python 3.12 -# uv tool update-shell -``` - -> See [docs.hud.ai](https://docs.hud.ai), or add docs to any MCP client: -> `claude mcp add --transport http docs-hud https://docs.hud.ai/mcp` - -Before starting, get your HUD_API_KEY at [hud.ai](https://hud.ai). - - -## Quickstart: Evals - -For a tutorial that explains the agent and evaluation design, run: - -```python -uvx hud-python quickstart ``` -Or just write your own agent loop (more [examples here](examples/)). - -```python -import asyncio, hud, os -from hud.settings import settings -from hud.clients import MCPClient -from hud.agents import ClaudeAgent -from hud.datasets import Task # See docs: https://docs.hud.ai/reference/tasks - -async def main() -> None: - with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.ai) - task = { - "prompt": "Reach 64 in 2048.", - "mcp_config": { - "hud": { - "url": "https://mcp.hud.ai/v3/mcp", # HUD's cloud MCP server (see https://docs.hud.ai/core-concepts/architecture) - "headers": { - "Authorization": f"Bearer {settings.api_key}", # Get your key at https://hud.ai - "Mcp-Image": "hudpython/hud-text-2048:v1.2" # Docker image from https://hub.docker.com/u/hudpython - } - } - }, - "evaluate_tool": {"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 64}}}, - } - task = Task(**task) - - # 1. Define the client explicitly: - client = MCPClient(mcp_config=task.mcp_config) - agent = ClaudeAgent( - mcp_client=client, - model="claude-sonnet-4-5", # requires ANTHROPIC_API_KEY - ) - - result = await agent.run(task) - - # 2. 
Or just: - # result = await ClaudeAgent().run(task) - - print(f"Reward: {result.reward}") - await client.shutdown() - -asyncio.run(main()) -``` - -The above example let's the agent play 2048 ([See replay](https://hud.ai/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f)) - -![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif) - -## Quickstart: Training - -RL using GRPO a Qwen2.5-VL model on any hud dataset: +Get your API key at [hud.ai](https://hud.ai) and set it: ```bash -hud get hud-evals/2048-basic # from HF -hud rl 2048-basic.json +export HUD_API_KEY=your-key-here ``` -> See [agent training docs](https://docs.hud.ai/train-agents/quickstart) - -Or make your own environment and dataset: - -```bash -hud init my-env && cd my-env -hud dev --interactive -# When ready to run: -hud rl -``` +> For CLI tools (`hud init`, `hud dev`, etc.): `uv tool install hud-python --python 3.12` -> See [environment design docs](https://docs.hud.ai/build-environments) +![Agent running on SheetBench](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif) -## Benchmarking Agents +## Usage -This is Claude Computer Use running on our proprietary financial analyst benchmark [SheetBench-50](https://huggingface.co/datasets/hud-evals/SheetBench-50): +### Unified Model API -![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif) +Use Claude, GPT, Gemini, or Grok through one OpenAI-compatible endpoint: -> [See this trace on _hud.ai_](https://hud.ai/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a) - -This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py): - -```bash -python examples/run_evaluation.py hud-evals/SheetBench-50 --full --agent claude -``` +```python +from openai import AsyncOpenAI +import os -Or in code: +client = AsyncOpenAI( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) -```python -import asyncio -from hud.datasets import run_dataset -from hud.agents import ClaudeAgent - -results = await run_dataset( - name="My SheetBench-50 Evaluation", - dataset="hud-evals/SheetBench-50", # <-- HuggingFace dataset - agent_class=ClaudeAgent, # <-- Your custom agent can replace this (see https://docs.hud.ai/evaluate-agents/create-agents) - agent_config={"model": "claude-sonnet-4-5"}, - max_concurrent=50, - max_steps=30, +response = await client.chat.completions.create( + model="claude-sonnet-4-5", # or gpt-4o, gemini-2.5-pro (https://hud.ai/models) + messages=[{"role": "user", "content": "Hello!"}] ) -print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}") ``` -> Running a dataset creates a job and streams results to the [hud.ai](https://hud.ai) platform for analysis and [leaderboard submission](https://docs.hud.ai/evaluate-agents/leaderboards). - -## Building Environments (MCP) +Every call is traced at [hud.ai](https://hud.ai). โ†’ [Docs](https://docs.hud.ai/quick-links/gateway) -This is how you can make any environment into an interactable one in 5 steps: +### Environments -1. Define MCP server layer using [`MCPServer`](https://docs.hud.ai/reference/environments) +Turn your code into tools agents can call. 
Define how to evaluate them: ```python -from hud.server import MCPServer -from hud.tools import HudComputerTool +from hud import Environment -mcp = MCPServer("My Environment") +env = Environment("my-env") -# Add hud tools (see all tools: https://docs.hud.ai/reference/tools) -mcp.tool(HudComputerTool()) +@env.tool() +def search(query: str) -> str: + """Search the knowledge base.""" + return db.search(query) -# Or custom tools (see https://docs.hud.ai/build-environments/adapting-software) -@mcp.tool("launch_app"): -def launch_app(name: str = "Gmail") -... - -if __name__ == "__main__": - mcp.run() +@env.scenario("find-answer") +async def find_answer(question: str, answer: str): + response = yield f"Find: {question}" # Prompt + yield 1.0 if answer in response else 0.0 # Reward ``` -2. Write a simple Dockerfile that installs packages and runs: +The agent runs between the yields. First yield sends the prompt, second yield scores the result. โ†’ [Docs](https://docs.hud.ai/quick-links/environments) ยท [Templates](https://hud.ai/environments) -```python -CMD ["python", "-m", "hud_controller.server"] -``` +### A/B Evals -And build the image: +Test different models. Repeat runs to see the distribution: -```bash -hud build # runs docker build under the hood -``` +```python +import hud -Or run it in interactible mode +task = env("find-answer", question="What is 2+2?", answer="4") -```bash -hud dev +async with hud.eval(task, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": ctx.prompt}] + ) + await ctx.submit(response.choices[0].message.content) ``` -3. Debug it with the CLI to see if it launches: +**Variants** test configurations. **Groups** repeat for distribution. Results stream to [hud.ai](https://hud.ai). โ†’ [Docs](https://docs.hud.ai/quick-links/ab-testing) -```console -$ hud debug my-name/my-environment:latest +### Deploy & Train -โœ“ Phase 1: Docker image exists -โœ“ Phase 2: MCP server responds to initialize -โœ“ Phase 3: Tools are discoverable -โœ“ Phase 4: Basic tool execution works -โœ“ Phase 5: Parallel performance is good - -Progress: [โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ] 5/5 phases (100%) -โœ… All phases completed successfully! -``` - -Analyze it to see if all tools appear: - -```console -$ hud analyze hudpython/hud-remote-browser:latest -โ  โœ“ Analysis complete -... -Tools -โ”œโ”€โ”€ Regular Tools -โ”‚ โ”œโ”€โ”€ computer -โ”‚ โ”‚ โ””โ”€โ”€ Control computer with mouse, keyboard, and screenshots -... -โ””โ”€โ”€ Hub Tools - โ”œโ”€โ”€ setup - โ”‚ โ”œโ”€โ”€ navigate_to_url - โ”‚ โ”œโ”€โ”€ set_cookies - โ”‚ โ”œโ”€โ”€ ... - โ””โ”€โ”€ evaluate - โ”œโ”€โ”€ url_match - โ”œโ”€โ”€ page_contains - โ”œโ”€โ”€ cookie_exists - โ”œโ”€โ”€ ... - -๐Ÿ“ก Telemetry Data - Live URL https://live.anchorbrowser.io?sessionId=abc123def456 -``` - -4. When the tests pass, push it up to the docker registry: +Push to GitHub, connect on hud.ai, run at scale: ```bash -hud push # needs docker login, hud api key -``` - -5. 
Now you can use `mcp.hud.ai` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.ai](https://hud.ai): - -```python -from hud.agents import ClaudeAgent - -result = await ClaudeAgent().run({ # See all agents: https://docs.hud.ai/reference/agents - "prompt": "Please explore this environment", - "mcp_config": { - "my-environment": { - "url": "https://mcp.hud.ai/v3/mcp", - "headers": { - "Authorization": f"Bearer {os.getenv('HUD_API_KEY')}", - "Mcp-Image": "my-name/my-environment:latest" - } - } - # "my-environment": { # or use hud run which wraps local and remote running - # "cmd": "hud", - # "args": [ - # "run", - # "my-name/my-environment:latest", - # ] - # } - } -}) - +hud init # Scaffold environment +git push # Push to GitHub +# Connect on hud.ai โ†’ New โ†’ Environment +hud eval my-org/my-eval --model gpt-4o --group-size 100 +# Or create and run tasks on the platform ``` -> See the full environment design guide and common pitfalls in [`environments/README.md`](environments/README.md) +Every run generates training data. Use it to fine-tune or run RL. โ†’ [Docs](https://docs.hud.ai/quick-links/deploy) -## Leaderboards & benchmarks +## Links -All leaderboards are publicly available on [hud.ai/leaderboards](https://hud.ai/leaderboards) (see [docs](https://docs.hud.ai/evaluate-agents/leaderboards)) +- ๐Ÿ“– [Documentation](https://docs.hud.ai) +- โŒจ๏ธ [CLI Reference](https://docs.hud.ai/reference/cli/overview) +- ๐Ÿ† [Leaderboards](https://hud.ai/leaderboards) +- ๐ŸŒ [Environment Templates](https://hud.ai/environments) +- ๐Ÿค– [Supported Models](https://hud.ai/models) +- ๐Ÿ’ฌ [Discord](https://discord.gg/wkjtmHYYjm) -![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png) +## Enterprise -We highly suggest running 3-5 evaluations per dataset for the most consistent results across multiple jobs. +Building agents at scale? We work with teams on custom environments, benchmarks, and training. -Using the [`run_dataset`](https://docs.hud.ai/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it: - -## Reinforcement Learning with GRPO - -This is a Qwenโ€‘2.5โ€‘VLโ€‘3B agent training a policy on the 2048-basic browser environment: - -![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png) - -Train with the new interactive `hud rl` flow: - -```bash -# Install CLI -uv tool install hud-python@latest --python 3.12 - -# Option A: Run directly from a HuggingFace dataset -hud rl hud-evals/2048-basic - -# Option B: Download first, modify, then train -hud get hud-evals/2048-basic -hud rl 2048-basic.json - -# Optional: baseline evaluation -hud eval 2048-basic.json -``` - -Supports multiโ€‘turn RL for both: -- Languageโ€‘only models (e.g., `Qwen/Qwen2.5-7B-Instruct`) -- Visionโ€‘Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`) - -By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.ai`, and lets you monitor/manage models at `hud.ai/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training). - -Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.ai/train-agents/quickstart`. 
- -Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart โ†’ Pricing](https://docs.hud.ai/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.ai/project/billing). - -## Architecture - -```mermaid -%%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%% -graph LR - subgraph "Platform" - Dashboard["๐Ÿ“Š hud.ai"] - API["๐Ÿ”Œ mcp.hud.ai"] - end - - subgraph "hud" - Agent["๐Ÿค– Agent"] - Task["๐Ÿ“‹ Task"] - SDK["๐Ÿ“ฆ SDK"] - end - - subgraph "Environments" - LocalEnv["๐Ÿ–ฅ๏ธ Local Docker
(Development)"] - RemoteEnv["โ˜๏ธ Remote Docker
(100s Parallel)"] - end - - subgraph "otel" - Trace["๐Ÿ“ก Traces & Metrics"] - end - - Dataset["๐Ÿ“š Dataset
(HuggingFace)"] - - AnyMCP["๐Ÿ”— Any MCP Client
(Cursor, Claude, Custom)"] - - Agent <--> SDK - Task --> SDK - Dataset <-.-> Task - SDK <-->|"MCP"| LocalEnv - SDK <-->|"MCP"| API - API <-->|"MCP"| RemoteEnv - SDK --> Trace - Trace --> Dashboard - AnyMCP -->|"MCP"| API - -``` - -## CLI reference - -| Command | Purpose | Docs | -| ----------------------- | ------------------------------------------ | ---- | -| [`hud init`](https://docs.hud.ai/reference/cli/init) | Create new environment with boilerplate. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/init) | -| [`hud dev`](https://docs.hud.ai/reference/cli/dev) | Hot-reload development with Docker. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/dev) | -| [`hud build`](https://docs.hud.ai/reference/cli/build) | Build image and generate lock file. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/build) | -| [`hud push`](https://docs.hud.ai/reference/cli/push) | Share environment to registry. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/push) | -| [`hud pull `](https://docs.hud.ai/reference/cli/pull) | Get environment from registry. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/pull) | -| [`hud analyze `](https://docs.hud.ai/reference/cli/analyze) | Discover tools, resources, and metadata. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/analyze) | -| [`hud debug `](https://docs.hud.ai/reference/cli/debug) | Five-phase health check of an environment. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/debug) | -| [`hud run `](https://docs.hud.ai/reference/cli/run) | Run MCP server locally or remotely. | [๐Ÿ“–](https://docs.hud.ai/reference/cli/run) | - -## Roadmap - -- Merging our forks in to the main `mcp`, `mcp_use` repositories -- Helpers for building new environments (see [current guide](environments/README.md)) -- Integrations with every major agent framework -- Evaluation environment registry -- MCP opentelemetry standard +[๐Ÿ“… Book a call](https://cal.com/jay-hud) ยท [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) ## Contributing -We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines. - -Key areas: -- [Environment examples](environments/) - Add new MCP environments -- [Agent implementations](hud/agents/) - Add support for new LLM providers -- [Tool library](hud/tools/) - Extend the built-in tool collection -- [RL training](hud/rl/) - Improve reinforcement learning pipelines +We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md). -Thanks to all our contributors! +Key areas: [Agents](hud/agents/) ยท [Tools](hud/tools/) ยท [Environments](https://hud.ai/environments) @@ -412,4 +150,4 @@ Thanks to all our contributors! } ``` -> **License**: HUD is released under the MIT License โ€“ see the [LICENSE](LICENSE) file for details. +MIT License ยท [LICENSE](LICENSE) diff --git a/docs/advanced/testing-environments.mdx b/docs/advanced/testing-environments.mdx new file mode 100644 index 00000000..99ece815 --- /dev/null +++ b/docs/advanced/testing-environments.mdx @@ -0,0 +1,105 @@ +--- +title: "Testing Environments" +description: "Test scenarios, tools, and environment logic locally" +icon: "flask-vial" +--- + +Before deploying, test locally. See [Sandboxing](/guides/sandboxing) for Docker vs no-Docker patterns. 
+ +## Local Testing + +| Environment | `local_test.py` | +|-------------|-----------------| +| No Docker | `from env import env` | +| Docker | `env.connect_url("http://localhost:8765/mcp")` | + +Both use the same API after setup: + +```python +async with env: + tools = env.as_tools() # List available tools + result = await env.call_tool("my_tool", arg="val") # Call a tool +``` + +## Testing Scenarios Directly + +Scenarios are async generators. `hud.eval()` drives them automatically, but you can test the logic directlyโ€”this is exactly what runs at the start and end of `hud.eval()`: + +```python +async def checkout(user_id: str, amount: int = 100): + # Setup + prompt (first yield) โ€” runs at hud.eval() start + answer = yield f"Complete checkout for {user_id}, ${amount}" + + # Evaluation (second yield) โ€” runs after agent submits + yield 1.0 if "success" in answer.lower() else 0.0 + +async def test(): + gen = checkout("alice", 50) + prompt = await anext(gen) # What hud.eval() does at start + reward = await gen.asend("Success!") # What hud.eval() does after submit + assert reward == 1.0 +``` + +If your scenario tests pass, `hud.eval()` will behave identically. + +## Mocking + +`env.mock()` intercepts at the tool layerโ€”agents only see tools: + +```python +env.mock() # All tools return fake responses +env.mock_tool("send_email", {"status": "sent"}) + +# Check mock state +assert env.is_mock == True +``` + +## Hot-Reload + +For Docker environments, `hud dev -w path` reloads Python on save: + +```bash +hud dev -w scenarios -w tools --port 8765 +``` + +System services (postgres, VNC, browsers) persist across reloads. + +## Debugging Build Failures + +`hud build` runs the exact same pipeline as **New โ†’ Environment** on [hud.ai](https://hud.ai)โ€”so if it passes locally, it'll work in production. If the build fails or the container crashes on startup, use `hud debug` to run a 5-phase compliance test: + +```bash +hud debug my-env:latest +``` + +Output shows exactly which phase failed: +``` +โœ“ Phase 1: Docker image exists +โœ“ Phase 2: MCP server responds to initialize +โœ— Phase 3: Tool discovery failed + โ†’ Error: Connection refused on port 8005 + โ†’ Hint: Backend service may not be starting +``` + +You can also debug a directory (builds first) or stop at a specific phase: + +```bash +hud debug . # Build and debug current directory +hud debug . --max-phase 3 # Stop after phase 3 +hud debug --config mcp.json # Debug from config file +``` + +## Useful Environment Properties + +```python +# Check parallelization (for running multiple evals) +env.is_parallelizable # True if all connections are remote + +# List what's connected +env.connections # Dict of connection names โ†’ connectors +env.is_connected # True if in async context + +# Resources and prompts (beyond tools) +await env.list_resources() # MCP resources +await env.list_prompts() # MCP prompts +``` diff --git a/docs/beta/index.mdx b/docs/beta/index.mdx index b318cad3..6485a3fd 100644 --- a/docs/beta/index.mdx +++ b/docs/beta/index.mdx @@ -11,5 +11,5 @@ Beta features are experimental and may change in future releases. 
## Available Beta Features - Fine-tune models with reinforcement learning on your HUD tasks (invite-only) + Fine-tune models on your HUD tasks (invite-only) diff --git a/docs/build-environments/index.mdx b/docs/build-environments/index.mdx index 40ec910f..4981b22e 100644 --- a/docs/build-environments/index.mdx +++ b/docs/build-environments/index.mdx @@ -66,9 +66,6 @@ hud eval tasks.json # Deploy to registry hud push - -# Train agents on your tasks -hud rl tasks.json ``` --- @@ -83,7 +80,6 @@ hud rl tasks.json | Troubleshoot | `hud debug my-env:dev` | | Build image | `hud build` | | Push to registry | `hud push` | -| RL training | `hud rl tasks.json` | --- @@ -93,3 +89,20 @@ hud rl tasks.json * **CLI reference**: [CLI Overview](/reference/cli/overview) Have fun โ€“ and remember: *stderr for logs, stdout for MCP!* + +--- + +## Available Environments + +Browse ready-to-use environments and templates at **[hud.ai/environments](https://hud.ai/environments)**. + +| Environment | Description | +|-------------|-------------| +| `hud-blank` | Minimal starter template | +| `hud-browser` | Browser automation with Playwright | +| `hud-remote-browser` | Cloud browser providers (Steel, Anchor, etc.) | +| `hud-deepresearch` | Deep research with web search | +| `hud-rubrics` | LLM-as-judge evaluations | +| `coding-template` | Full coding env with VNC, Postgres, Redis | + +Each environment is available as a GitHub template you can fork and customize. diff --git a/docs/build-environments/spec.mdx b/docs/build-environments/spec.mdx index a87160df..61069b21 100644 --- a/docs/build-environments/spec.mdx +++ b/docs/build-environments/spec.mdx @@ -24,7 +24,7 @@ graph TD - No nonโ€‘MCP output on stdout (all logging to stderr). - No required file layout, framework, or endpoints. -Recommended (for HUD RL/evals): provide tools named `setup` and `evaluate`. +Recommended (for HUD evals): provide tools named `setup` and `evaluate`. ## Make it runnable remotely (mcp.hud.ai) @@ -143,7 +143,7 @@ The same structure is used by `hud init`โ€™s template and by programmatic tasks. ] ``` -Switching this file to remote is as simple as replacing the `mcp_config` with the `hud` section shown above (or using `hud rl`, which will help convert it automatically). +Switching this file to remote is as simple as replacing the `mcp_config` with the `hud` section shown above (or using `hud convert`, which will help convert it automatically). 
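+
+For reference, a minimal sketch of what the swapped-in remote `mcp_config` can look like (the image tag below is a placeholder; use the image you published with `hud push`, and substitute your own HUD API key for the bearer token):
+
+```json
+{
+  "hud": {
+    "url": "https://mcp.hud.ai/v3/mcp",
+    "headers": {
+      "Authorization": "Bearer <HUD_API_KEY>",
+      "Mcp-Image": "your-org/your-env:latest"
+    }
+  }
+}
+```
+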
Run tasks with either the CLI or an agent: diff --git a/docs/docs.json b/docs/docs.json index b9091131..2b73c81c 100644 --- a/docs/docs.json +++ b/docs/docs.json @@ -29,12 +29,81 @@ "navigation": { "versions": [ { - "version": "0.4.74", + "version": "0.5.0", "groups": [ { "group": "Get Started", "pages": [ "index", + "llm-quickstart" + ] + }, + { + "group": "Essentials", + "pages": [ + "quick-links/gateway", + "quick-links/ab-testing", + "quick-links/environments", + "quick-links/deploy" + ] + }, + { + "group": "Guides", + "pages": [ + "guides/integrations", + "guides/sandboxing", + "guides/best-practices", + "migration" + ] + }, + { + "group": "Advanced", + "pages": [ + "advanced/testing-environments" + ] + }, + { + "group": "SDK Reference", + "pages": [ + "reference/evals", + "reference/environments", + "reference/tools", + "reference/mcpserver", + "reference/agents", + "reference/types" + ] + }, + { + "group": "CLI Reference", + "pages": [ + "reference/cli/overview", + "reference/cli/init", + "reference/cli/dev", + "reference/cli/build", + "reference/cli/push", + "reference/cli/analyze", + "reference/cli/debug", + "reference/cli/run", + "reference/cli/eval", + "reference/cli/rft", + "reference/cli/misc" + ] + }, + { + "group": "Community", + "pages": [ + "contributing" + ] + } + ] + }, + { + "version": "0.4.73", + "groups": [ + { + "group": "Get Started", + "pages": [ + "index-legacy", "quickstart", "llm-quickstart" ] @@ -50,10 +119,11 @@ { "group": "SDK Reference", "pages": [ + "reference/eval", "reference/tools", "reference/agents", "reference/types", - "reference/environments", + "reference/mcpserver", "reference/tasks" ] }, @@ -64,17 +134,10 @@ "build-environments/spec" ] }, - { - "group": "Training (RL)", - "pages": [ - "train-agents/quickstart", - "train-agents/tasks" - ] - }, { "group": "HUD Gateway", "pages": [ - "gateway/index" + "gateway/index-legacy" ] }, { @@ -103,7 +166,6 @@ "reference/cli/debug", "reference/cli/run", "reference/cli/eval", - "reference/cli/rl", "reference/cli/rft", "reference/cli/misc" ] diff --git a/docs/evaluate-agents/benchmarks.mdx b/docs/evaluate-agents/benchmarks.mdx index b63d9b17..09561a30 100644 --- a/docs/evaluate-agents/benchmarks.mdx +++ b/docs/evaluate-agents/benchmarks.mdx @@ -18,7 +18,30 @@ hud eval tasks.json hud eval hud-evals/SheetBench-50 claude --full ``` -- SDK +- SDK (Context Manager) + +```python +import hud + +# Single task evaluation +async with hud.eval("hud-evals/SheetBench-50:0") as ctx: + agent = MyAgent() + result = await agent.run(ctx) + ctx.reward = result.reward + +# All tasks with variants +async with hud.eval( + "hud-evals/SheetBench-50:*", + variants={"model": ["claude-sonnet", "gpt-4o"]}, + group=3, + max_concurrent=50, +) as ctx: + agent = create_agent(model=ctx.variants["model"]) + result = await agent.run(ctx) + ctx.reward = result.reward +``` + +- SDK (Batch Execution) ```python from hud.datasets import run_tasks @@ -108,8 +131,9 @@ results = await run_tasks( ## See Also -- [`hud eval`](/reference/cli/eval) -- [`hud rl`](/reference/cli/rl) +- [Evaluation API](/reference/eval) - SDK reference for `hud.eval()` +- [`hud eval`](/reference/cli/eval) - CLI reference +- [`hud rft`](/reference/cli/rft) - [Tasks](/reference/tasks) - [Agents (SDK)](/reference/agents) diff --git a/docs/gateway/index.mdx b/docs/gateway/index-legacy.mdx similarity index 99% rename from docs/gateway/index.mdx rename to docs/gateway/index-legacy.mdx index ea235980..a60b6811 100644 --- a/docs/gateway/index.mdx +++ b/docs/gateway/index-legacy.mdx @@ 
-1,5 +1,5 @@ --- -title: "HUD Gateway" +title: "Gateway" description: "Unified LLM inference service with built-in auth and credit management." icon: "server" --- @@ -128,3 +128,4 @@ This example demonstrates: - Automatic token usage and latency tracking View your traces on the [HUD Dashboard](https://hud.ai/home). + diff --git a/docs/guides/best-practices.mdx b/docs/guides/best-practices.mdx new file mode 100644 index 00000000..662cbab9 --- /dev/null +++ b/docs/guides/best-practices.mdx @@ -0,0 +1,142 @@ +--- +title: "Best Practices" +description: "Design effective environments, evals, and grading logic" +icon: "star" +--- + +Building good agent evaluations requires thoughtful design at every layerโ€”the environment, the prompts, and the grading logic. This guide covers patterns that lead to useful, reliable signal. + +## Good Environments + +A good environment gives agents what they need to succeedโ€”and gives you what you need to evaluate them. + +### Observable State + +Agents need access to the right information. If they can't see the data they need, they can't complete the task. Design tools that expose useful state: + +```python +# โŒ Bad: Agent can't see what was created +@env.tool() +def create_user(name: str) -> str: + db.insert("users", name=name) + return "User created" + +# โœ… Good: Agent gets actionable data back +@env.tool() +def create_user(name: str) -> dict: + user_id = db.insert("users", name=name) + return {"id": user_id, "name": name, "created": True} +``` + +For grading, you also need to observe what happened. If the agent creates a database row, you need to query that database. If it uploads a file, you need to read that file. Be cognizant of what you can and cannot observeโ€”only ask agents to do things you can verify. + +### Deterministic Setup + +Each eval should seed the state it needs. HUD handles container isolationโ€”you handle making sure your scenario sets up the right data before the agent runs. + +```python +# โŒ Bad: Depends on whatever state exists +@env.scenario("find-user") +async def find_user(name: str): + answer = yield f"Find the user named {name}" + yield 1.0 if name in answer else 0.0 + +# โœ… Good: Seeds known state before eval +@env.scenario("find-user") +async def find_user(name: str): + await db.clear() + await db.insert("users", name=name, email=f"{name}@example.com") + + answer = yield f"Find the user named {name}" + yield 1.0 if name in answer else 0.0 +``` + +### Isolated Execution + +HUD sandboxes each evalโ€”containers don't share state. But if your environment connects to external services, think about stateful vs stateless. + +**Stateless services** are fine. Multiple agents can hit the same read-only API without interference. + +**Stateful services** need care. If 100 agents all hit the same database endpoint that modifies data, they'll step on each other. Use per-eval instances, transaction isolation, or target different records. + +## Good Evals + +An eval combines a prompt (the first `yield`) with grading logic (everything after). The prompt tells agents what to doโ€”write short-to-medium length instructions that ask for an unambiguous change you can verify. + +### Be Specific + +Ambiguous prompts lead to ambiguous grading. Say exactly what you want: + +``` +โŒ "Update the user settings" +โœ… "Change the email for user alice@example.com to alice.new@example.com" +``` + +Real-world example: *"Add a column to the Portfolio snapshot with the 'Phase' of the engagement. 
C-11X should be 'Phase 2', all else are 'Phase 1'."* + +### Only Ask for Testable Things + +If you can't observe the result, you can't grade it. Don't ask an agent to "think about" somethingโ€”ask it to do something you can verify. + +``` +โŒ "Consider the best approach to optimize the query" +โœ… "Rewrite the query to use an index on the email column" +``` + +### Create Variations + +Evals are easier to write when you have a specific failure mode in mind. If you've observed agents struggling with something, incorporate that into future evals. + +Create different versions with more or less explicit instructionsโ€”step-by-step guidance vs. high-level goals. Use [variants](/quick-links/ab-testing) to test these systematically. Variations make it easier to tune difficulty later. + +## Good Graders + +The grading logic after the first `yield` determines the grade. Fair grading means useful signal. + +### Match the Prompt + +If the prompt says "create a document with a Japanese car brand", check for any Japanese car brandโ€”not just "Toyota". But don't accept any document either. Exactly as strict as the prompt implies. + +```python +# โŒ Bad: Too strictโ€”only accepts one answer +@env.scenario("add-car") +async def add_car(): + answer = yield "Add a Japanese car brand to the document" + yield 1.0 if answer == "Toyota" else 0.0 + +# โœ… Good: Accepts any valid answer +@env.scenario("add-car") +async def add_car(): + answer = yield "Add a Japanese car brand to the document" + japanese_brands = ["toyota", "honda", "nissan", "mazda", "subaru"] + yield 1.0 if any(brand in answer.lower() for brand in japanese_brands) else 0.0 +``` + +### Use Partial Credit + +Partial grades help you see where agents fail. Did they add to cart but not checkout? That's useful signal. Break complex grading into sub-checks with weighted grades: + +```python +@env.scenario("checkout") +async def checkout(product: str): + answer = yield f"Add {product} to cart and checkout" + + score = 0.0 + if await product_in_cart(product): + score += 0.3 # Partial credit for first step + if await order_completed(product): + score += 0.7 # Most credit for completion + yield score +``` + +### Sanity Check + +At minimum, verify two cases: unchanged state โ†’ 0.0, correct completion โ†’ 1.0. For grading logic you'll reuse across many evals, write unit tests. Load a known state snapshot, verify the grade matches what you expect. + +## Finding the Right Difficulty + +A good eval set has rangeโ€”target 20-30% average success rate. You want high variance: some runs should grade 0.0, others 1.0. If every run grades the same, there's no signal to learn from. Having both positive and negative examples on the same eval is what makes improvement possible. + +**Iterate.** Create an eval, test it manually, run it at scale, check the difficulty. If it's too easy or too hard, adjust the prompt or grading. Use your best evals as templates for more. + +**Train.** Every eval generates dataโ€”prompts, tool calls, grades. Use successful runs for fine-tuning. The loop: eval โ†’ analyze โ†’ train โ†’ eval again. diff --git a/docs/guides/integrations.mdx b/docs/guides/integrations.mdx new file mode 100644 index 00000000..0d826e07 --- /dev/null +++ b/docs/guides/integrations.mdx @@ -0,0 +1,430 @@ +--- +title: "Integrations" +description: "Use any agent framework with HUD environments" +icon: "robot" +--- + +HUD environments work with any agent framework. 
The `Environment` class provides format converters for all major providers, and `hud.eval()` handles setup, evaluation, and tracing automatically. + +Every example on this page uses the `eval` defined below and the [Gateway](/quick-links/gateway) for inference. + +## The Example Environment + +```python +import hud + +CEOS = {"hud": "Jay Ram", "openai": "Sam Altman", "anthropic": "Dario Amodei"} + +env = hud.Environment("trivia") + +@env.tool() +def lookup_ceo(company: str) -> str: + """Look up the CEO of a company.""" + return CEOS.get(company.lower(), "Unknown") + +@env.scenario("initials") +async def find_initials(company: str): + answer = yield f"What are the initials of the CEO of {company}?" + ceo = CEOS.get(company.lower()) + correct = "".join(word[0] for word in ceo.split()) if ceo else None + yield 1.0 if answer and correct and correct in answer.upper() else 0.0 + +task = env("initials", company="HUD") +``` + +--- + +## OpenAI + +The OpenAI SDK supports three APIs: Chat Completions, Responses, and the Agents SDK. + +### Chat Completions + +```python +import os +from openai import AsyncOpenAI +import hud + +client = AsyncOpenAI( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + messages = [{"role": "user", "content": ctx.prompt}] + + while True: + response = await client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=ctx.as_openai_chat_tools() + ) + + msg = response.choices[0].message + messages.append(msg) + + if not msg.tool_calls: + break + + for tool_call in msg.tool_calls: + result = await ctx.call_tool(tool_call) + messages.append(result) + + await ctx.submit(msg.content or "") +``` + +### Responses API + +```python +async with hud.eval(eval) as ctx: + response = await client.responses.create( + model="gpt-4o", + input=ctx.prompt, + tools=ctx.as_openai_responses_tools() + ) + + for item in response.output: + if item.type == "function_call": + await ctx.call_tool(item) + + await ctx.submit(response.output_text) +``` + +### Agents SDK + +```python +from agents import Agent, Runner +import hud + +async with hud.eval(eval) as ctx: + agent = Agent( + name="trivia-agent", + instructions="Answer trivia questions. Use tools to look up information.", + tools=ctx.as_openai_agent_tools() + ) + + result = await Runner.run(agent, ctx.prompt) + await ctx.submit(result.final_output) +``` + +Requires: `pip install openai-agents` + +--- + +## Anthropic + +Claude's Messages API with tool use. + +```python +import os +from anthropic import AsyncAnthropic +import hud + +client = AsyncAnthropic( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + messages = [{"role": "user", "content": ctx.prompt}] + + while True: + response = await client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=1024, + messages=messages, + tools=ctx.as_claude_tools() + ) + + tool_uses = [b for b in response.content if b.type == "tool_use"] + if not tool_uses: + break + + tool_results = [await ctx.call_tool(block) for block in tool_uses] + + messages.append({"role": "assistant", "content": response.content}) + messages.append({"role": "user", "content": tool_results}) + + text = next((b.text for b in response.content if b.type == "text"), "") + await ctx.submit(text) +``` + +Requires: `pip install anthropic` + +--- + +## Gemini + +Google's Gemini API with function calling. 
+ +```python +import os +import google.generativeai as genai +import hud + +genai.configure(api_key=os.environ["GOOGLE_API_KEY"]) +model = genai.GenerativeModel("gemini-2.0-flash") + +async with hud.eval(eval) as ctx: + chat = model.start_chat() + + response = chat.send_message( + ctx.prompt, + tools=ctx.as_gemini_tools(), + tool_config=ctx.as_gemini_tool_config() + ) + + while True: + part = response.candidates[0].content.parts[0] + if not hasattr(part, "function_call") or not part.function_call: + break + + result = await ctx.call_tool(part) + response = chat.send_message(result) + + await ctx.submit(response.text) +``` + +Requires: `pip install google-generativeai` + +--- + +## browser-use + +Browser automation for web agents. + +```python +import os +from browser_use import Agent +from langchain_openai import ChatOpenAI +import hud + +llm = ChatOpenAI( + model="gpt-4o", + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + agent = Agent(task=ctx.prompt, llm=llm) + result = await agent.run() + await ctx.submit(str(result)) +``` + +Requires: `pip install browser-use playwright && playwright install` + +--- + +## LangChain + +LangChain's agent framework with tool calling. + +```python +import os +from langchain_openai import ChatOpenAI +from langchain.agents import create_tool_calling_agent, AgentExecutor +from langchain_core.prompts import ChatPromptTemplate +import hud + +llm = ChatOpenAI( + model="gpt-4o", + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + tools = ctx.as_langchain_tools() + + prompt = ChatPromptTemplate.from_messages([ + ("system", "You are a helpful assistant."), + ("human", "{input}"), + ("placeholder", "{agent_scratchpad}"), + ]) + + agent = create_tool_calling_agent(llm, tools, prompt) + executor = AgentExecutor(agent=agent, tools=tools) + + result = await executor.ainvoke({"input": ctx.prompt}) + await ctx.submit(result["output"]) +``` + +Requires: `pip install langchain langchain-openai langchain-core` + +--- + +## LlamaIndex + +LlamaIndex's ReAct agent with tool integration. + +```python +import os +from llama_index.llms.openai import OpenAI +from llama_index.core.agent import ReActAgent +import hud + +llm = OpenAI( + model="gpt-4o", + api_base="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + tools = ctx.as_llamaindex_tools() + + agent = ReActAgent.from_tools(tools, llm=llm, verbose=True) + response = await agent.achat(ctx.prompt) + + await ctx.submit(str(response)) +``` + +Requires: `pip install llama-index-core llama-index-llms-openai` + +--- + +## Google ADK + +Google's Agent Development Kit for Gemini-powered agents. + +```python +import os +from google.adk.agents import Agent +from google.adk.runners import Runner +import hud + +async with hud.eval(eval) as ctx: + agent = Agent( + name="trivia-agent", + model="gemini-2.0-flash", + instruction="Answer trivia questions. Use tools to look up information.", + tools=ctx.as_adk_tools() + ) + + runner = Runner(agent=agent) + result = await runner.run(ctx.prompt) + + await ctx.submit(result.output) +``` + +Requires: `pip install google-adk` + +--- + +## CrewAI + +Multi-agent orchestration with roles and tasks. 
+ +```python +import os +from crewai import Agent, Task, Crew +from langchain_openai import ChatOpenAI +import hud + +llm = ChatOpenAI( + model="gpt-4o", + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +async with hud.eval(eval) as ctx: + tools = ctx.as_langchain_tools() + + researcher = Agent( + role="Researcher", + goal="Find accurate information", + backstory="Expert at finding information", + tools=tools, + llm=llm + ) + + task = LegacyTask( + description=ctx.prompt, + expected_output="The initials of the CEO", + agent=researcher + ) + + crew = Crew(agents=[researcher], tasks=[task]) + result = crew.kickoff() + await ctx.submit(str(result)) +``` + +Requires: `pip install crewai langchain-openai` + +--- + +## AutoGen + +Microsoft's multi-agent conversation framework. + +```python +import os +from autogen import AssistantAgent, UserProxyAgent +import hud + +async with hud.eval(eval) as ctx: + config_list = [{ + "model": "gpt-4o", + "base_url": "https://inference.hud.ai", + "api_key": os.environ["HUD_API_KEY"] + }] + + assistant = AssistantAgent( + name="assistant", + llm_config={"config_list": config_list} + ) + + for tool in ctx.as_tools(): + @assistant.register_for_execution() + async def tool_fn(name=tool.name, **kwargs): + return await ctx.call_tool(name, **kwargs) + + user = UserProxyAgent( + name="user", + human_input_mode="NEVER", + code_execution_config=False + ) + + result = await user.a_initiate_chat(assistant, message=ctx.prompt) + await ctx.submit(result.summary) +``` + +Requires: `pip install pyautogen` + +--- + +## Format Reference + +| Method | Returns | Use With | +|--------|---------|----------| +| `as_openai_chat_tools()` | OpenAI Chat format | OpenAI Chat Completions | +| `as_openai_responses_tools()` | OpenAI Responses format | OpenAI Responses API | +| `as_openai_agent_tools()` | FunctionTool objects | OpenAI Agents SDK | +| `as_claude_tools()` | Anthropic format | Claude API | +| `as_gemini_tools()` | Gemini format | Google AI | +| `as_adk_tools()` | ADK FunctionTool objects | Google ADK | +| `as_langchain_tools()` | StructuredTool objects | LangChain, CrewAI | +| `as_llamaindex_tools()` | FunctionTool objects | LlamaIndex | +| `as_tools()` | MCP Tool objects | Raw MCP, AutoGen | + +All `call_tool()` calls auto-detect the input format and return matching output format. + +--- + +## Bring Your Own + +Don't see your framework? The pattern is simple: + +1. Get tools in your framework's format (or use `as_tools()` for raw MCP) +2. Run your agent loop +3. Call `ctx.call_tool()` for each tool invocation +4. Call `ctx.submit()` with the final answer + +```python +async with hud.eval(eval) as ctx: + tools = ctx.as_tools() # Raw MCP format + + result = await my_custom_agent(ctx.prompt, tools, ctx.call_tool) + + await ctx.submit(result) +``` + +The environment handles setup, evaluation, and tracing. You handle the agent logic. diff --git a/docs/guides/sandboxing.mdx b/docs/guides/sandboxing.mdx new file mode 100644 index 00000000..dbebcb3d --- /dev/null +++ b/docs/guides/sandboxing.mdx @@ -0,0 +1,161 @@ +--- +title: "Sandboxing" +description: "Turn your existing services into agent-testable environments" +icon: "shield" +--- + +You have a production stack. You want an agent on it. But you can't just point an agent at productionโ€”it'll make real changes, hit real APIs, affect real users. And you can't test at scale against a single live instance with shared state. 
+ +HUD lets you mock your production environment so agents can run against it safely. Connect your services in a few lines, mock external dependencies, and run thousands of agents in parallelโ€”each isolated, each reproducible, each generating useful data. + +## Connecting Your Stack + +HUD wraps your existing infrastructure without rewriting it: + +```python +from hud import Environment + +env = Environment("my-env") + +# Connect what you already have +env.connect_fastapi(app) # FastAPI โ†’ tools +env.connect_openapi("https://api.example.com/openapi.json") # OpenAPI spec โ†’ tools +env.connect_hub("hud-evals/browser") # HUD Hub environments +env.connect_image("my-service:v1") # Docker images +``` + +## Making Databases Safe + +Agents need isolated state. Three patterns work: + +**In-memory SQLite** โ€” fastest, resets automatically: +```python +import sqlite3 +db = sqlite3.connect(":memory:") # Fresh per eval + +@env.scenario("update-order") +async def update_order(order_id: str): + db.executescript(Path("fixtures/orders.sql").read_text()) # Seed + answer = yield f"Update order {order_id} to shipped" + row = db.execute("SELECT status FROM orders WHERE id=?", (order_id,)).fetchone() + yield 1.0 if row and row[0] == "shipped" else 0.0 +``` + +**Transaction rollback** โ€” use your real DB, undo changes: +```python +@env.scenario("process-refund") +async def process_refund(order_id: str): + conn = await asyncpg.connect(DATABASE_URL) + tx = conn.transaction() + await tx.start() + try: + answer = yield f"Process refund for order {order_id}" + # Check result... + yield reward + finally: + await tx.rollback() # Always undo + await conn.close() +``` + +**Fixture seeding** โ€” deterministic starting state: +```python +await db.execute("TRUNCATE orders, users CASCADE") +await db.executemany("INSERT INTO users ...", fixtures["users"]) +``` + +## Mocking External Services + +`env.mock()` intercepts at the tool layer. Agents only see tools, so this is usually all you need: + +```python +env.mock() # All tools return schema-based fake responses +env.mock_tool("send_email", {"status": "sent", "id": "mock-123"}) +env.mock_tool("charge_card", {"success": True, "transaction_id": "tx-mock"}) +``` + +For stateful mocking (tracking what happened for assertions): + +```python +class MockPaymentService: + def __init__(self): + self.charges = [] + + async def charge(self, amount: int, card_token: str) -> dict: + self.charges.append({"amount": amount, "token": card_token}) + return {"success": True, "id": f"ch-{len(self.charges)}"} + +payments = MockPaymentService() + +@env.scenario("checkout") +async def checkout(cart_total: int): + _ = yield f"Complete checkout for ${cart_total}" + yield 1.0 if any(c["amount"] == cart_total for c in payments.charges) else 0.0 +``` + +## Docker vs No Docker + +| Pattern | When to Use | Examples | +|---------|-------------|----------| +| **No Docker** | Pure Python, API integrations | Web research, LLM grading | +| **Docker** | System dependencies, persistent services | VNC, PostgreSQL, browsers | + +### Pattern 1: No Docker + +Import and test directly: + +```python +# local_test.py +from env import env + +async def test(): + async with env: + result = await env.call_tool("search", query="test") +``` + +### Pattern 2: Docker + +Connect to the running container instead of importing. 
Same API, different transportโ€”because your tools now run inside the container where dependencies live: + +```python +# local_test.py +env = Environment("browser-env") +env.connect_url("http://localhost:8765/mcp") # Connect instead of import + +async def test(): + async with env: # Same API from here + result = await env.call_tool("navigate", url="https://example.com") +``` + +```bash +hud build # Build image +hud dev -w scenarios -w tools --port 8765 # Start with hot-reload +python local_test.py # Connects to container +``` + +### Hot-Reload + +`hud dev -w path` reloads Python on save. System services (postgres, VNC) persist. + +**Rebuild** (`hud build`) when: Dockerfile, system packages, or dependencies change. + +## Environment Structure + +Start simple, add structure as needed: + +``` +# Simple # Organized +my-env/ my-env/ +โ”œโ”€โ”€ env.py โ”œโ”€โ”€ env.py +โ”œโ”€โ”€ local_test.py โ”œโ”€โ”€ scenarios/ +โ””โ”€โ”€ Dockerfile.hud โ”œโ”€โ”€ setup/ + โ”œโ”€โ”€ evaluate/ + โ””โ”€โ”€ Dockerfile.hud +``` + +Most environments fall somewhere between. Split when files get hard to navigate. + +## What's Next + +**Test locally.** See [Testing Environments](/advanced/testing-environments) for debugging and scenario testing. + +**Deploy.** Push to GitHub, connect on [hud.ai](https://hud.ai). See [Deploy](/quick-links/deploy). diff --git a/docs/index-legacy.mdx b/docs/index-legacy.mdx new file mode 100644 index 00000000..ecccffeb --- /dev/null +++ b/docs/index-legacy.mdx @@ -0,0 +1,113 @@ +--- +title: "Introduction" +description: "OSS environment + evals toolkit for AI agents." +icon: "book" +--- + + +**Version 0.4.73** - Latest stable release + + + + + Test Claude, Operator, or custom agents on benchmarks like SheetBench and OSWorld + + + + Wrap any software in dockerized MCP for scalable and generalizable agent evaluation + + + +## What is HUD? + +HUD connects AI agents to software environments using the Model Context Protocol (MCP). Whether you're evaluating existing agents or building new environments, HUD provides the infrastructure. + +```mermaid +graph LR + Agent["๐Ÿค– Any Agent
(Claude, Operator, etc.)"] + MCP["๐Ÿ”Œ MCP Protocol
(Tool Calls)"] + Env["๐Ÿ“ฆ Any Environment
(Browser, OS, etc.)"] + + Agent -->|"call_tool()"| MCP + MCP -->|"click(x, y)"| Env + Env -->|"screenshot"| MCP + MCP -->|"get_response()"| Agent + + style Agent fill:#3b82f6,stroke:#1e40af,stroke-width:2px,color:#ffffff + style MCP fill:#f59e0b,stroke:#d97706,stroke-width:2px,color:#ffffff + style Env fill:#10b981,stroke:#047857,stroke-width:2px,color:#ffffff +``` + +## Why HUD? + +- **๐Ÿ”Œ MCP-native**: Any agent can connect to any environment +- **๐Ÿ“ก Live telemetry**: Debug every tool call at [hud.ai](https://hud.ai) +- **โšก HUD Gateway**: Unified inference API for all LLMs +- **๐Ÿš€ Production-ready**: From local Docker to cloud scale +- **๐ŸŽฏ Built-in benchmarks**: OSWorld-Verified, SheetBench-50, and more +- **๐Ÿ”ง CLI tools**: Create, develop, and run with `hud init`, `hud dev`, `hud run`, `hud eval` + + + + Run your first agent evaluation with zero setup + + + + Unified inference API for OpenAI, Anthropic, Gemini, and Open Source Models + + + + Give your AI assistant full knowledge of HUD docs + + + + + +## Quick Example + +```python +import asyncio, os, hud +from hud.datasets import Task +from hud.agents import ClaudeAgent + +async def main(): + # Define evaluation task with remote MCP + task = Task( + prompt="Win a game of 2048 by reaching the 128 tile", + mcp_config={ + "hud": { + "url": "https://mcp.hud.ai/v3/mcp", + "headers": { + "Authorization": f"Bearer {os.getenv('HUD_API_KEY')}", + "Mcp-Image": "hudevals/hud-text-2048:0.1.3" + } + } + }, + setup_tool={"name": "setup", "arguments": {"name": "board", "arguments": { "board_size": 4}}}, + evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 64}}} + ) + + # Run agent (auto-creates MCP client) + agent = ClaudeAgent.create() + result = await agent.run(task) + print(f"Score: {result.reward}") + +asyncio.run(main()) +``` + +## Community + + + + Star the repo and contribute + + + + Join our community + + + +### Are you an enterprise building agents? + +[๐Ÿ“… Hop on a call](https://cal.com/jay-hud) or [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) + diff --git a/docs/index.mdx b/docs/index.mdx index da51407e..6bd837f1 100644 --- a/docs/index.mdx +++ b/docs/index.mdx @@ -1,104 +1,126 @@ --- title: "Introduction" -description: "OSS RL environment + evals toolkit." +description: "Build, evaluate, and train AI agents." icon: "book" --- - -**Version 0.4.74** - Latest stable release - +HUD gives you three things: a unified API for every model, a way to turn your code into agent-callable tools, and infrastructure to run evaluations at scale. - - - Test Claude, Operator, or custom agents on benchmarks like SheetBench and OSWorld - +## Install - - Wrap any software in dockerized MCP for scalable and generalizable agent evaluation - +```bash +# Install CLI +uv tool install hud-python --python 3.12 - - Use reinforcement learning and GRPO on evaluations to improve agent performance - - +# Set your API key +hud set HUD_API_KEY=your-key-here +``` -## What is HUD? - -HUD connects AI agents to software environments using the Model Context Protocol (MCP). Whether you're evaluating existing agents, building new environments, or training models with RL, HUD provides the infrastructure. - -```mermaid -graph LR - Agent["๐Ÿค– Any Agent
(Claude, Operator, etc.)"] - MCP["๐Ÿ”Œ MCP Protocol
(Tool Calls)"] - Env["๐Ÿ“ฆ Any Environment
(Browser, OS, etc.)"] - - Agent -->|"call_tool()"| MCP - MCP -->|"click(x, y)"| Env - Env -->|"screenshot"| MCP - MCP -->|"get_response()"| Agent - - style Agent fill:#3b82f6,stroke:#1e40af,stroke-width:2px,color:#ffffff - style MCP fill:#f59e0b,stroke:#d97706,stroke-width:2px,color:#ffffff - style Env fill:#10b981,stroke:#047857,stroke-width:2px,color:#ffffff +Get your API key at [hud.ai/settings/api-keys](https://hud.ai/settings/api-keys). + +## 1. Gateway: Any Model, One API + +Stop juggling API keys. Point any OpenAI-compatible client at `inference.hud.ai` and use Claude, GPT, Gemini, or Grok: + +```python +from openai import AsyncOpenAI +import os + +client = AsyncOpenAI( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +response = await client.chat.completions.create( + model="claude-sonnet-4-5", # or gpt-4o, gemini-2.5-pro, grok-4-1-fast... + messages=[{"role": "user", "content": "Hello!"}] +) ``` -## Why HUD? +Every call is traced. View them at [hud.ai/home](https://hud.ai/home). -- **๐Ÿ”Œ MCP-native**: Any agent can connect to any environment -- **๐Ÿ“ก Live telemetry**: Debug every tool call at [hud.ai](https://hud.ai) -- **โšก HUD Gateway**: Unified inference API for all LLMs -- **๐Ÿš€ Production-ready**: From local Docker to cloud scale -- **๐ŸŽฏ Built-in benchmarks**: OSWorld-Verified, SheetBench-50, and more - - **๐Ÿ”ง CLI tools**: Create, develop, run, and train with `hud init`, `hud dev`, `hud run`, `hud eval`, `hud rl` +โ†’ [More on Gateway](/quick-links/gateway) - - - Run your first agent evaluation with zero setup - +## 2. Environments: Your Code, Agent-Ready - - Unified inference API for OpenAI, Anthropic, Gemini, and Open Source Models - +A production API is one live instance with shared stateโ€”you can't run 1,000 parallel tests without them stepping on each other. Environments spin up fresh for every evaluation: isolated, deterministic, reproducible. Each generates training data. - - Give your AI assistant full knowledge of HUD docs - - +Turn your code into tools agents can call. Define scripts that evaluate what agents do: + +```python +from hud import Environment + +env = Environment("my-env") + +@env.tool() +def search(query: str) -> str: + """Search the knowledge base.""" + return db.search(query) + +@env.scenario("find-answer") +async def find_answer(question: str): + answer = yield f"Find the answer to: {question}" + yield 1.0 if "correct" in answer.lower() else 0.0 +``` +Scripts define the prompt (first yield) and the scoring logic (second yield). The agent runs in between. +โ†’ [More on Environments](/quick-links/environments) -## Quick Example +## 3. Evals: Test and Improve + +Run your scenario with different models. 
Compare results: ```python -import asyncio, os, hud -from hud.datasets import Task -from hud.agents import ClaudeAgent - -async def main(): - # Define evaluation task with remote MCP - task = Task( - prompt="Win a game of 2048 by reaching the 128 tile", - mcp_config={ - "hud": { - "url": "https://mcp.hud.ai/v3/mcp", - "headers": { - "Authorization": f"Bearer {os.getenv('HUD_API_KEY')}", - "Mcp-Image": "hudevals/hud-text-2048:0.1.3" - } - } - }, - setup_tool={"name": "setup", "arguments": {"name": "board", "arguments": { "board_size": 4}}}, - evaluate_tool={"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 64}}} +import hud + +task = env("find-answer", question="What is 2+2?") + +async with hud.eval(task, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": ctx.prompt}] ) - - # Run agent (auto-creates MCP client) - agent = ClaudeAgent.create() - result = await agent.run(task) - print(f"Score: {result.reward}") + await ctx.submit(response.choices[0].message.content) +``` + +**Variants** test different configurations. **Groups** repeat each to see the distribution. Results show up on [hud.ai](https://hud.ai/home) with scores, traces, and side-by-side comparisons. -asyncio.run(main()) +โ†’ [More on A/B Evals](/quick-links/ab-testing) + +## 4. Deploy and Scale + +Push your environment to GitHub, connect it on [hud.ai](https://hud.ai), and run thousands of evals in parallel. Every run generates training data. + +```bash +hud init # Scaffold environment +git push # Push to GitHub +# Connect on hud.ai โ†’ New โ†’ Environment +hud eval my-org/my-eval --model gpt-4o --group-size 100 ``` +โ†’ [More on Deploy](/quick-links/deploy) + +## Next Steps + + + + One endpoint for every model. Full observability. + + + + Tools, scripts, and local testing. + + + + Variants, groups, and finding what works. + + + + Run at scale. Generate training data. + + + ## Community @@ -107,11 +129,12 @@ asyncio.run(main()) - Join our community + Join the community -### Are you an enterprise building agents? +## Enterprise -[๐Ÿ“… Hop on a call](https://cal.com/jay-hud) or [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) +Building agents at scale? We work with teams on custom environments, benchmarks, and training pipelines. +[๐Ÿ“… Book a call](https://cal.com/jay-hud) ยท [๐Ÿ“ง founders@hud.ai](mailto:founders@hud.ai) diff --git a/docs/llm-quickstart.mdx b/docs/llm-quickstart.mdx index 7bde2a04..bcd99d95 100644 --- a/docs/llm-quickstart.mdx +++ b/docs/llm-quickstart.mdx @@ -30,5 +30,5 @@ icon: "sparkles" -Try asking your assistant: "How do I create a custom agent in HUD?" or "Help me debug MCP tool calls" +Try asking: "How do I create an Environment with tools?" or "How do scripts and evals work in HUD?" \ No newline at end of file diff --git a/docs/migration.mdx b/docs/migration.mdx new file mode 100644 index 00000000..f8f76ce0 --- /dev/null +++ b/docs/migration.mdx @@ -0,0 +1,183 @@ +--- +title: "Migrating from v4" +description: "Transition from Task-based environments to the unified Environment class" +icon: "arrow-right-arrow-left" +--- + +v4 separated environments (Docker containers) from evaluation logic (Task objects). v5 unifies everything in the `Environment` classโ€”tools, setup, and scoring live together. 
+ + +**Deprecation Notice**: `LegacyTask`, `setup_tool`, and `evaluate_tool` are deprecated in v0.5.0 and will be removed in v0.6.0 (no earlier than March 1st, 2026). Use `Task.from_v4()` for quick migration or `@env.scenario()` for new code. + + +## Good News: Your Code Still Works + +`Environment` inherits from `MCPServer`. Same API, same behavior. Just change the import: + +```python +# Before +from hud.server import MCPServer +mcp = MCPServer("my-env") + +@mcp.tool() +def my_tool(): ... + +mcp.run() +``` + +```python +# After +from hud import Environment +env = Environment("my-env") + +@env.tool() +def my_tool(): ... + +env.run() +``` + +That's it. Your Dockerfile, your tools, your `run()` callโ€”all unchanged. Environment adds scripts, connectors, and integrations on top. + +## Migration Path 1: Quick Conversion with Task.from_v4() + +The fastest way to migrate existing v4 codeโ€”no changes to task definitions needed: + +```python +# BEFORE (deprecated in v0.6.0) +from hud.datasets import LegacyTask + +legacy_task = LegacyTask( + prompt="Navigate to google.com", + mcp_config={"hud": {...}}, + setup_tool={"name": "navigate", "arguments": {"url": "https://google.com"}}, + evaluate_tool={"name": "check_url", "arguments": {}} +) + +# AFTER - One-line conversion +from hud.eval import Task + +task = Task.from_v4(legacy_task) # Converts LegacyTask โ†’ Task +# Also works with: Task.from_v4(dict), Task.from_v4(json_string) + +# Works the same with agents +agent = ClaudeAgent.create() +result = await agent.run(task) +``` + +`Task.from_v4()` automatically: +- Runs `setup_tool` at the start of evaluation +- Runs `evaluate_tool` at the end to compute reward +- Preserves all existing behavior + +## Migration Path 2: Full Scenario Migration (Recommended) + +For new code or when refactoring, migrate `setup_tool` and `evaluate_tool` to `@env.scenario()`. + +**The rule is simple:** +- `setup_tool` code โ†’ **before the first yield** +- `evaluate_tool` code โ†’ **after the first yield** + +```python +# BEFORE (deprecated in v0.6.0) +task = LegacyTask( + prompt="What's the current URL?", + mcp_config={"hud": {...}}, + setup_tool={"name": "navigate", "arguments": {"url": "https://google.com"}}, + evaluate_tool={"name": "check_url", "arguments": {"expected": "google.com"}} +) + +# AFTER +from hud import Environment + +env = Environment("browser").connect_hub("hud-evals/browser") + +@env.scenario("navigate-google") +async def navigate_google(): + # ===== SETUP SECTION (replaces setup_tool) ===== + await env.call_tool("navigate", url="https://google.com") + + # ===== PROMPT (first yield) ===== + answer = yield "What's the current URL?" 
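+    # The agent runs here, between the two yields; its final answer is returned above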
+ + # ===== EVALUATE SECTION (replaces evaluate_tool) ===== + result = await env.call_tool("check_url", expected="google.com") + + # ===== REWARD (second yield) ===== + yield 1.0 if result else 0.0 + +# Create task from scenario +task = env("navigate-google") +``` + +### Multiple setup_tool Calls + +If you have multiple setup tools, just call them in sequence: + +```python +# BEFORE +setup_tool=[ + {"name": "navigate", "arguments": {"url": "..."}}, + {"name": "login", "arguments": {"user": "..."}}, + {"name": "go_to_page", "arguments": {"page": "settings"}} +] + +# AFTER +@env.scenario("settings-test") +async def settings_test(): + # Multiple setup steps - just call them in order + await env.call_tool("navigate", url="...") + await env.call_tool("login", user="...") + await env.call_tool("go_to_page", page="settings") + + answer = yield "Verify the settings page loaded correctly" + + result = await env.call_tool("check_settings") + yield 1.0 if result else 0.0 +``` + +## Using with Built-in Agents + +Built-in agents (ClaudeAgent, OpenAIAgent, etc.) work with both patterns: + +```python +from hud.agents import ClaudeAgent + +agent = ClaudeAgent.create() + +# Works with Task from scenario +result = await agent.run(env("navigate-google")) + +# Works with Task.from_v4() conversion +result = await agent.run(Task.from_v4(legacy_task)) +``` + +## Optional: Bring Your Own Agent + +v5 gives you the `hud.eval()` context manager for maximum flexibility: + +```python +async with hud.eval(env("checkout", product="laptop")) as ctx: + # Use OpenAI, Anthropic, your own agentโ€”whatever you want + response = await client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": ctx.prompt}], + tools=ctx.as_openai_chat_tools() + ) + + # Handle tool calls, run your agent loop... + await ctx.submit(response.choices[0].message.content) + +print(ctx.reward) +``` + +The old `ClaudeAgent` and `OperatorAgent` still workโ€”even with the new `hud.eval()` system. But now you're not locked into a specific agent spec. Pair with the [Gateway](/quick-links/gateway) to use any model through one API. + +## Quick Reference + +| v4 (deprecated in v0.6.0) | v5 | +|---------------------------|-----| +| `LegacyTask(...)` | `Task.from_v4(...)` (quick) or `env("scenario", ...)` (recommended) | +| `setup_tool` | Code before first yield in `@env.scenario()` | +| `evaluate_tool` | Code after first yield in `@env.scenario()` | +| `MCPServer` | `Environment` (drop-in replacement) | +| `agent.run(task)` | Still works, or use `hud.eval()` for BYOA | diff --git a/docs/quick-links/ab-testing.mdx b/docs/quick-links/ab-testing.mdx new file mode 100644 index 00000000..0c1215f8 --- /dev/null +++ b/docs/quick-links/ab-testing.mdx @@ -0,0 +1,61 @@ +--- +title: "A/B Evals" +description: "Find out which model actually performs best for your use case." +icon: "flask-vial" +--- + +LLM outputs vary from run to runโ€”ask the same question twice and you might get different quality answers. To find out which model actually performs best, you need to test each one multiple times and look at the spread. **Variants** let you test different models side-by-side. **Groups** repeat each test so you see the full distribution, not just one lucky or unlucky result. 
+ +## Variants + +Pass the configurations you want to test: + +```python +import hud + +async with hud.eval(variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": "What is 2+2?"}] + ) + ctx.reward = 1.0 if "4" in response.choices[0].message.content else 0.0 + +for result in ctx.results: + print(f"{result.variants}: reward={result.reward}") +``` + +## Groups + +Run each variant multiple times to get a distribution: + +```python +async with hud.eval( + variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, + group=5 # 10 runs total: 2 models ร— 5 each +) as ctx: + ... +``` + +The `hud.eval` manager will parallelize your evals automatically and show the distribution across all your runs on [hud.ai](https://hud.ai/home). + +## Remote Rollouts + +Once you've [deployed an environment](/quick-links/deploy#deploying-environments) and created evals, run them by name: + +```python +async with hud.eval("my-org/checkout-laptop", variants={"model": ["gpt-4o", "claude"]}) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": ctx.prompt}] + ) +``` + +The platform loads everythingโ€”environment, prompt, evaluation logic, comparisons across models. You just provide the agent. + +Or via CLI: + +```bash +hud eval my-org/checkout-laptop --model gpt-4o --group-size 5 +``` + +Or run directly on the platformโ€”see [Running at Scale](/quick-links/deploy#running-at-scale). diff --git a/docs/quick-links/deploy.mdx b/docs/quick-links/deploy.mdx new file mode 100644 index 00000000..5553c7d8 --- /dev/null +++ b/docs/quick-links/deploy.mdx @@ -0,0 +1,66 @@ +--- +title: "Deploy" +description: "Deploy environments. Create evals. Run and train at scale." +icon: "rocket" +--- + +You've built an environment with tools and scripts. Deploy it to the platform and you can run evals at scaleโ€”hundreds of parallel runs across models, all traced, all generating training data. + +## Deploying Environments + +Start with `hud init` ([see Environments](/quick-links/environments)) to scaffold locally. When ready: + +1. Go to [hud.ai](https://hud.ai) โ†’ **New** โ†’ **Environment** +2. Connect your GitHub repo and name your environment +3. Push changes and it rebuilds automatically, like Vercel + +Your environmentโ€”tools, scripts, everythingโ€”is now live. Connect from anywhere: + +```python +env.connect_hub("my-org/my-env") +``` + +## Running at Scale + +Once deployed, create evals on [hud.ai](https://hud.ai) from your scripts. Each eval is a frozen configurationโ€”same prompt, same scoring, every time. + +Your scenario might take arguments: + +```python +@env.scenario("checkout") +async def checkout_flow(product_name: str, apply_coupon: bool = False): + yield f"Complete checkout for {product_name}" + (" with coupon" if apply_coupon else "") + yield 1.0 if order_confirmed() else 0.0 +``` + +On the platform, click **New Eval** โ†’ select your scenario โ†’ fill in the arguments. Create multiple evals from the same scenario: + +| Eval Name | Arguments | +|-----------|-----------| +| `checkout-laptop` | `product_name="Laptop"`, `apply_coupon=False` | +| `checkout-phone-coupon` | `product_name="Phone"`, `apply_coupon=True` | +| `checkout-headphones` | `product_name="Headphones"`, `apply_coupon=False` | + +Then run themโ€”select an eval, choose variants and groups, launch hundreds of runs in parallel. Every run is traced. 
Results show scores, distributions, and side-by-side model comparisons. These become your training data. + +For A/B testing with variants and groups, see [A/B Evals](/quick-links/ab-testing). + +## What's Next? + +With your environment deployed: + +- **Scale**: Launch thousands of rollouts. Every run generates tracesโ€”prompts, tool calls, rewards. +- **Analyze**: See which evals agents struggle with. Compare models across your entire benchmark. +- **Train**: Use runs as training data. Fine-tune on successful completions. Run reinforcement learning to optimize for your specific environment. + +The loop: deploy โ†’ eval at scale โ†’ analyze โ†’ train โ†’ redeploy. Agents get better at *your* environment. + + + + Connect OpenAI, Anthropic, LangChain, and more. + + + + Turn production services into safe test environments. + + diff --git a/docs/quick-links/environments.mdx b/docs/quick-links/environments.mdx new file mode 100644 index 00000000..2827055a --- /dev/null +++ b/docs/quick-links/environments.mdx @@ -0,0 +1,108 @@ +--- +title: "Environments" +description: "Turn your code into agent-callable tools. Define how agents are evaluated." +icon: "cube" +--- + +An environment is everything an agent can interact withโ€”your APIs, services, databases, wrapped as tools. But it's more than that: the environment also defines how agents are *evaluated* through **scripts**. When you deploy an environment, you're creating a sandbox that agents can learn from at scale. + +## Why Environments, Not API Servers? + +Your production API is a single live instance with shared stateโ€”you can't run 500 tests against it in parallel without causing chaos. Environments spin up fresh for every evaluation: isolated, deterministic, reproducible. Run thousands in parallel, each starting from the exact state you define, each generating training data. An API server is a live system you observe. An environment is a sandbox you control. + +## Tools + +Start with `hud init` to scaffold an environmentโ€”works with existing codebases or from scratch: + +```bash +hud init +``` + +Every tool is just a function. Decorate it with `@env.tool()` and agents can call it: + +```python +from hud import Environment + +env = Environment("my-env") + +@env.tool() +async def search(query: str) -> str: + """Search the knowledge base.""" + return db.search(query) +``` + +Got a FastAPI app? One line: + +```python +env.connect_fastapi(app) +``` + +All your routes become tools. Run it: + +```python +async with env() as ctx: + tools = await ctx.list_tools() + result = await ctx.call_tool("search", query="test") +``` + +## Scripts + +To evaluate an agent, you need two things: what to tell it, and how to score what it did. Scripts capture both with two `yield` statements: + +```python +@env.scenario("checkout") +async def checkout_flow(product_name: str): + # Yield the prompt, receive the agent's final answer + answer = yield f"Add '{product_name}' to cart and complete checkout" + + # Score based on environment state and/or the answer + order_exists = await check_order_status(product_name) + yield 1.0 if order_exists else 0.0 +``` + +The agent runs between the yields. First yield sends the prompt and returns the agent's answer. Second yield checks environment stateโ€”database rows, files, API callsโ€”and returns a reward. Scripts live with the environment because only the environment knows how to verify what happened. 
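+
+As a further sketch (the `notes` dict and `save_note` tool below are hypothetical, not part of the SDK), a scenario can combine a state check with an answer check:
+
+```python
+notes: dict[str, str] = {}  # hypothetical in-memory store the tool writes to
+
+@env.tool()
+def save_note(title: str, body: str) -> str:
+    """Save a note so the scenario below can verify it."""
+    notes[title] = body
+    return f"Saved '{title}'"
+
+@env.scenario("write-note")
+async def write_note(topic: str):
+    # First yield: send the prompt, receive the agent's final answer
+    answer = yield f"Save a note titled '{topic}', then reply 'done'"
+
+    # Second yield: reward based on environment state AND the answer text
+    yield 1.0 if topic in notes and "done" in answer.lower() else 0.0
+```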
+ +## Evals + +Call the environment with a scenario name and arguments to create a task: + +```python +task = env("checkout", product_name="Laptop") + +async with hud.eval(task, group=4) as ctx: + # Connect your agent here. Handle tool calls, run agent loop... + response = await client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": ctx.prompt}], + tools=ctx.as_openai_chat_tools() + ) + + await ctx.submit(response.choices[0].message.content) + +print(ctx.reward) +``` + +This creates a trace on [hud.ai](https://hud.ai/home). Add [variants](/quick-links/ab-testing) to A/B test across models. To run evals at scale, [deploy your environment](/quick-links/deploy). + +## Mock Mode + +Testing your agent loop without hitting real services? Mock mode returns fake responses based on tool schemas: + +```python +env.mock() +env.mock_tool("search", "Mock search results") # Manual override of mock + +async with hud.eval(env(), group=4) as ctx: + tools = env.as_openai_chat_tools() + + response = await client.chat.completions.create( + model="claude-sonnet-4-5", + messages=[{"role": "user", "content": "Search for X"}], + tools=tools + ) + + # Returns mock value instead of hitting real service + result = await env.call_tool(response.choices[0].message.tool_calls[0]) +``` +Your agent code stays the sameโ€”just toggle `env.mock()` for local testing. + diff --git a/docs/quick-links/gateway.mdx b/docs/quick-links/gateway.mdx new file mode 100644 index 00000000..11d5d73d --- /dev/null +++ b/docs/quick-links/gateway.mdx @@ -0,0 +1,128 @@ +--- +title: "Gateway" +description: "One endpoint for every model. One API key. Full observability." +icon: "server" +--- + +Stop juggling API keys. HUD Gateway routes to Anthropic, OpenAI, Gemini, xAI, and more through a single OpenAI-compatible endpointโ€”with built-in telemetry. Swap `model="gpt-4o"` for `model="claude-sonnet-4-5"` and you're [A/B testing](/quick-links/ab-testing) across providers. Continuous RL from production coming soon. + +## Quick Start + +Point any OpenAI-compatible client at `inference.hud.ai`: + + + +```python Python +from openai import AsyncOpenAI +import os + +client = AsyncOpenAI( + base_url="https://inference.hud.ai", + api_key=os.environ["HUD_API_KEY"] +) + +response = await client.chat.completions.create( + model="claude-sonnet-4-5", # or gpt-4o, gemini-2.5-pro, grok-4-1-fast... + messages=[{"role": "user", "content": "Hello!"}] +) +``` + +```bash curl +curl -X POST https://inference.hud.ai/chat/completions \ + -H "Authorization: Bearer $HUD_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "claude-sonnet-4-5", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + + + +## Supported Models + +Full list at [hud.ai/models](https://hud.ai/models). 
+ + +| Model | Routes | +|-------|--------| +| `claude-sonnet-4-5` | chat, messages | +| `claude-haiku-4-5` | chat, messages | +| `claude-opus-4-5` | chat, messages | +| `claude-opus-4-1` | chat, messages | + + + +| Model | Routes | +|-------|--------| +| `gpt-5.1` | chat, responses | +| `gpt-5-mini` | chat, responses | +| `gpt-4o` | chat, responses | +| `gpt-4o-mini` | chat, responses | +| `operator` | responses | + + + +| Model | Routes | +|-------|--------| +| `gemini-3-pro-preview` | chat | +| `gemini-2.5-pro` | chat | +| `gemini-2.5-computer-use-preview` | gemini | + + + +| Model | Routes | +|-------|--------| +| `grok-4-1-fast` | chat | +| `z-ai/glm-4.5v` | chat | + + +## Telemetry + +Wrap code in a plain `hud.eval()` to group inference calls. In the trace you'll see the full conversation in sequence, not scattered API calls. + +```python +async with hud.eval(): + response = await client.chat.completions.create( + model="claude-sonnet-4-5", + messages=[{"role": "user", "content": "Hello!"}] + ) +``` + +Or inject a trace ID manually if you're not using `hud.eval()`. Generate a UUID and pass it with each request in a task: + + + +```python Python +import uuid + +trace_id = str(uuid.uuid4()) # e.g. "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + +response = await client.chat.completions.create( + model="claude-sonnet-4-5", + messages=[{"role": "user", "content": "Hello!"}], + extra_headers={"Trace-Id": trace_id} +) +``` + +```bash curl +curl -X POST https://inference.hud.ai/chat/completions \ + -H "Authorization: Bearer $HUD_API_KEY" \ + -H "Content-Type: application/json" \ + -H "Trace-Id: a1b2c3d4-e5f6-7890-abcd-ef1234567890" \ + -d '{ + "model": "claude-sonnet-4-5", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + + + +View traces at [hud.ai/home](https://hud.ai/home). + +## Routes + +- **chat** โ€” `/chat/completions` (OpenAI-compatible) +- **messages** โ€” `/messages` (Anthropic-compatible) +- **responses** โ€” `/responses` (OpenAI Responses API) +- **gemini** โ€” Google Gemini native API diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx index 650f200a..6e14401c 100644 --- a/docs/quickstart.mdx +++ b/docs/quickstart.mdx @@ -55,7 +55,19 @@ Get up and running with HUD in minutes. Follow these four steps to install the C -## Environments/CLI Quick Reference +## SDK Quick Reference + +```python +import hud + +# Run evaluation with the new eval API +async with hud.eval("hud-evals/SheetBench-50:0") as ctx: + agent = MyAgent() + result = await agent.run(ctx) + ctx.reward = result.reward +``` + +## CLI Quick Reference ```bash # Create sample environment diff --git a/docs/reference/agents.mdx b/docs/reference/agents.mdx index fa092a50..06316c24 100644 --- a/docs/reference/agents.mdx +++ b/docs/reference/agents.mdx @@ -97,7 +97,7 @@ Claude-specific implementation using Anthropic's API. 
```python from hud.agents import ClaudeAgent -from hud.datasets import Task +from hud.datasets import LegacyTask agent = ClaudeAgent.create( checkpoint_name="claude-sonnet-4-5", @@ -105,7 +105,7 @@ agent = ClaudeAgent.create( ) result = await agent.run( - Task( + LegacyTask( prompt="Navigate to example.com", mcp_config={ "hud": { @@ -245,12 +245,12 @@ agent = OpenAIChatAgent.create( ```python from hud.agents import ClaudeAgent -from hud.datasets import Task +from hud.datasets import LegacyTask agent = ClaudeAgent.create() result = await agent.run( - Task( + LegacyTask( prompt="Click the submit button", mcp_config={ "hud": { @@ -270,7 +270,7 @@ print(f"Reward: {result.reward}, Done: {result.done}") ### With Setup and Evaluation ```python -task = Task( +task = LegacyTask( prompt="Find the price of the product", mcp_config={ "hud": { diff --git a/docs/reference/cli/eval.mdx b/docs/reference/cli/eval.mdx index 0bfad028..e8be3c21 100644 --- a/docs/reference/cli/eval.mdx +++ b/docs/reference/cli/eval.mdx @@ -227,5 +227,5 @@ hud cancel --all - [Tasks Reference](/reference/tasks) - Task configuration - [Agents Reference](/reference/agents) - Agent options -- [`hud rl`](/reference/cli/rl) - RL training +- [`hud rft`](/reference/cli/rft) - Reinforcement fine-tuning - [`hud cancel`](/reference/cli/misc) - Cancel remote jobs diff --git a/docs/reference/cli/overview.mdx b/docs/reference/cli/overview.mdx index a474e3ef..49d226a1 100644 --- a/docs/reference/cli/overview.mdx +++ b/docs/reference/cli/overview.mdx @@ -21,8 +21,7 @@ The HUD CLI provides a complete toolkit for creating, developing, and running MC - `hud debug` โ€” 5โ€‘phase compliance test - `hud run` โ€” Execute (Python module/command/Docker) - `hud eval` โ€” Run agents on tasks/datasets - - `hud rl` โ€” Train with GRPO on tasks - - `hud rft` โ€” Fine-tune models with RL (BETA, invite-only) + - `hud rft` โ€” Fine-tune models (BETA, invite-only) @@ -62,8 +61,7 @@ hud --version | `hud debug` | Image/dir/config | 5โ€‘phase compliance test | `hud debug my-env:latest` | | `hud run` | Module/command/image | Execute server (local/remote) | `hud run controller --reload` | | `hud eval` | Tasks/dataset | Run agent on tasks | `hud eval tasks.json claude` | -| `hud rl` | Tasks/dataset | Train with GRPO | `hud rl tasks.json --local` | -| `hud rft` | Tasks file | Fine-tune with RL (BETA, invite-only) | `hud rft run tasks.json` | +| `hud rft` | Tasks file | Fine-tune models (BETA, invite-only) | `hud rft run tasks.json` | ### Other Commands | Command | Description | Example | diff --git a/docs/reference/cli/rft.mdx b/docs/reference/cli/rft.mdx index 8d1d3be1..771b806d 100644 --- a/docs/reference/cli/rft.mdx +++ b/docs/reference/cli/rft.mdx @@ -1,6 +1,6 @@ --- title: "hud rft" -description: "Reinforcement Fine-Tuning commands (invite-only)" +description: "Fine-Tuning commands (invite-only)" icon: "brain-circuit" --- @@ -12,7 +12,7 @@ RFT is currently in BETA. Features and APIs may change. **Access Required**: RFT is available by invite only. Contact [founders@hud.ai](mailto:founders@hud.ai) to request access. -The `hud rft` command group provides tools for fine-tuning models using reinforcement learning on HUD tasks. +The `hud rft` command group provides tools for fine-tuning models on HUD tasks. 
## Subcommands @@ -133,4 +133,3 @@ hud rft status f5f050a3-99c1-4339-b819-ccb1325f79d8 --verbose ## See Also - [Beta RFT Documentation](/beta/rft) - Detailed guide and examples -- [hud rl](/reference/cli/rl) - Standard reinforcement learning training diff --git a/docs/reference/cli/rl.mdx b/docs/reference/cli/rl.mdx deleted file mode 100644 index f644770b..00000000 --- a/docs/reference/cli/rl.mdx +++ /dev/null @@ -1,87 +0,0 @@ ---- -title: "hud rl" -description: "Run GRPO reinforcement learning on tasks" -icon: "brain" ---- - -The `hud rl` command trains an agent with GRPO on tasks, locally or via the HUD remote service. - -## Usage - -```bash -hud rl [TASKS_FILE|DATASET] [MODEL] [OPTIONS] -``` - -## Arguments - - - Path to tasks JSON/JSONL file or HuggingFace dataset name. If omitted, looks for a tasks file in the current directory. - - - - Model to train (default: interactive selection) - - -## Options - - - Path to existing configuration file. Short: `-c` - - - - Output directory for checkpoints. Short: `-o` - - - - Restart the vLLM server before training - - - - Enable verbose output. Short: `-v` - - - - Disable DistributedDataParallel (even with multiple GPUs) - - - - Specific GPUs for DDP (e.g., `0,1,2,3`) - - - - Specific GPU for vLLM server - - - - Run training locally instead of the remote HUD server - - -## Behavior - -- If no tasks file is provided, an interactive picker helps locate one. -- Remote mode (default) converts tasks to remote MCP automatically (build/push as needed) and launches remote training. -- Local mode runs training on your machine (delegated to `local_runner`). - -## Examples - -```bash -# Remote (default): auto-convert tasks to remote, then train -hud rl tasks.json --model claude-rl - -# Local training with GPU selection -hud rl tasks.json llama3.1 --local --ddp-gpus 0,1 --vllm-gpu 0 - -# Use a dataset directly (remote) -hud rl hud-evals/SheetBench-50 --model claude-rl -``` - -## See Also - -- [`hud eval`](/reference/cli/eval) -- [`hud get`](/reference/cli/get) -- [`hud build`](/reference/cli/build) -- [`hud push`](/reference/cli/push) - -## Pricing & Billing - -See hosted vLLM and training GPU rates in the [Training Quickstart โ†’ Pricing](/train-agents/quickstart#pricing). Manage usage and billing at `https://hud.ai/project/billing`. \ No newline at end of file diff --git a/docs/reference/environments.mdx b/docs/reference/environments.mdx index 889477f9..94942849 100644 --- a/docs/reference/environments.mdx +++ b/docs/reference/environments.mdx @@ -1,490 +1,302 @@ --- -title: "Environments" -description: "SDK reference for building MCP environments" -icon: "cube" +title: "Environment" +description: "SDK reference for the Environment class - tools, connectors, and integrations" +icon: "desktop" --- -The HUD SDK provides `MCPServer` for building MCP-compatible environments that work with any MCP client. +`Environment` is the unified class for defining tools, connecting to services, and formatting for any LLM provider. -## MCPServer +## Environment ```python -from hud.server import MCPServer +from hud import Environment + +env = Environment("my-env") ``` -Enhanced FastMCP server with Docker-friendly features for building HUD environments. 
+### Constructor -**Constructor Parameters:** | Parameter | Type | Description | Default | |-----------|------|-------------|---------| -| `name` | `str` | Server name for MCP handshake | Required | -| `instructions` | `str` | Server instructions/description | `None` | -| `**fastmcp_kwargs` | `Any` | Additional FastMCP parameters | - | +| `name` | `str` | Environment name | `"environment"` | +| `instructions` | `str \| None` | Description/instructions | `None` | +| `conflict_resolution` | `ConflictResolution` | How to handle tool name conflicts | `PREFIX` | + +### Context Manager + +Environment must be used as an async context manager to connect: -**Key Features:** -1. **SIGTERM handling** - Graceful shutdown in containers via custom runner -2. **Initialize decorator** - Async setup during MCP initialize request (stdout is temporarily redirected to stderr during initialization to avoid corrupting MCP output) -3. **Shutdown decorator** - Runs only on SIGTERM (container termination), not on hotโ€‘reload/SIGINT -4. **Enhanced add_tool()** - Automatically handles `BaseTool` instances and raw FastMCP Tool objects -5. **Tool decorator passthrough** - `@mcp.tool` returns the original function for easy composition -6. **FastMCP inheritance** - All FastMCP methods available (`mount`, `resource`, `tool`) +```python +async with env: + tools = env.as_openai_chat_tools() + result = await env.call_tool("my_tool", arg="value") +``` -### Decorators +## Defining Tools -#### @initialize +### @env.tool() -Run async setup during MCP initialize request: +Register functions as callable tools: ```python -mcp = MCPServer(name="my-env") +@env.tool() +def count_letter(text: str, letter: str) -> int: + """Count occurrences of a letter in text.""" + return text.lower().count(letter.lower()) + +@env.tool() +async def fetch_data(url: str) -> dict: + """Fetch JSON data from URL.""" + async with httpx.AsyncClient() as client: + response = await client.get(url) + return response.json() +``` + +Tools are automatically documented from type hints and docstrings. + +## Scripts + +Scripts define evaluation logic with two yields: -@mcp.initialize -async def setup_environment(ctx): - """ - Initialize environment resources. +```python +@env.scenario("checkout") +async def checkout_flow(product: str): + # First yield: send prompt, receive answer + answer = yield f"Add '{product}' to cart and checkout" - Args: - ctx: RequestContext with: - - ctx.meta: Client metadata dict - - ctx.session: MCP ServerSession - """ - # Access metadata from agent (if provided) - if ctx.meta: - progress_token = ctx.meta.get("progressToken") - display_width = ctx.meta.get("display_width", 1920) - display_height = ctx.meta.get("display_height", 1080) - - # Send progress notifications - if progress_token: - await ctx.session.send_progress_notification( - progress_token=progress_token, - progress=50, - total=100, - message="Initializing environment..." 
- ) + # Second yield: return reward based on result + order_exists = await check_order(product) + yield 1.0 if order_exists else 0.0 ``` -#### @shutdown - -Run cleanup on SIGTERM (container termination only): +Create Tasks from scripts: ```python -@mcp.shutdown -async def cleanup(): - """Clean up resources on shutdown.""" - if browser_provider: - browser_provider.close() - logger.info("Cleanup complete") +task = env("checkout", product="laptop") + +async with hud.eval(task) as ctx: + await agent.run(ctx.prompt) + await ctx.submit(agent.response) ``` -### Tool Registration +## Connectors + +Connect to external services as tool sources. + +### connect_hub() -Three ways to register tools: +Connect to a deployed HUD environment: ```python -# 1. Decorator for simple functions -@mcp.tool() -async def my_tool(param: str) -> dict: - return {"result": param} - -# 2. Add BaseTool instances -from hud.tools import BashTool -bash = BashTool() -mcp.add_tool(bash) # Automatically uses bash.mcp internally - -# 3. Add non-BaseTool instances directly -from custom import PlaywrightTool -playwright = PlaywrightTool() -mcp.add_tool(playwright) # Added as-is +env.connect_hub("my-org/browser", prefix="browser") +# Tools available as browser_navigate, browser_click, etc. ``` -### Hub Pattern (mount) +### connect_fastapi() -Use BaseHub for organized tool namespaces: +Import FastAPI routes as tools: ```python -from hud.tools import BaseHub +from fastapi import FastAPI -# Create hub -setup_hub = BaseHub("setup") +api = FastAPI() -# Add internal tools (hidden from agents) -@setup_hub.tool("board") -async def setup_board(size: int = 4): - game = setup_hub.env - game.reset(size=size) - return [TextContent(text=f"{size}x{size} board initialized")] +@api.get("/users/{user_id}", operation_id="get_user") +def get_user(user_id: int): + return {"id": user_id, "name": "Alice"} -# Mount hub on server -mcp.mount(setup_hub) - -# Agents call via dispatcher: setup(name="board", arguments={"size": 4}) +env.connect_fastapi(api) +# Tool available as get_user ``` -### Resources +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `app` | `FastAPI` | FastAPI application | Required | +| `name` | `str \| None` | Server name | `app.title` | +| `prefix` | `str \| None` | Tool name prefix | `None` | +| `include_hidden` | `bool` | Include routes with `include_in_schema=False` | `True` | + +### connect_openapi() -Expose metadata via MCP resources: +Import from OpenAPI spec: ```python -@mcp.resource("telemetry://live") -async def get_telemetry(): - """Expose live telemetry data.""" - return { - "provider": os.getenv("BROWSER_PROVIDER"), - "status": "running" if browser_provider else "stopped", - "live_url": browser_provider.get_live_view_url() if browser_provider else None, - "timestamp": datetime.now().isoformat() - } +env.connect_openapi("https://api.example.com/openapi.json") ``` -### Running the Server +### connect_server() + +Mount an MCPServer or FastMCP directly: ```python -if __name__ == "__main__": - # Run with SIGTERM handling (stdio by default) - mcp.run() +from fastmcp import FastMCP - # Or use development transports (HTTP/SSE) - mcp.run(transport="http", port=8765) - mcp.run(transport="sse", port=8080) -``` +tools = FastMCP("tools") -When using HTTP/SSE, HUD development helper endpoints are available: +@tools.tool +def greet(name: str) -> str: + return f"Hello, {name}!" 
-- `GET /hud` โ€“ overview -- `GET /hud/tools` โ€“ list tools with schemas -- `GET /hud/resources` โ€“ list resources -- `GET /hud/prompts` โ€“ list prompts +env.connect_server(tools) +``` -## Real Environment Examples +### connect_mcp_config() -### Minimal Environment +Connect via MCP config dict: ```python -# src/hud_controller/server.py -from hud.server import MCPServer -from mcp.types import TextContent - -mcp = MCPServer(name="counter-env") -counter = {"value": 0} - -@mcp.tool() -async def setup(start_value: int = 0): - """Initialize counter.""" - counter["value"] = start_value - return {"status": "ready", "counter": counter["value"]} - -@mcp.tool() -async def increment(): - """Increment counter.""" - counter["value"] += 1 - return [TextContent(text=f"Counter: {counter['value']}", type="text")] - -@mcp.tool() -async def evaluate(target: int): - """Check if target reached.""" - from hud.tools.types import EvaluationResult - return EvaluationResult( - reward=1.0 if counter["value"] >= target else 0.0, - done=counter["value"] >= target - ) - -if __name__ == "__main__": - mcp.run() +env.connect_mcp_config({ + "my-server": { + "command": "uvx", + "args": ["some-mcp-server"] + } +}) ``` -### text_2048 Environment +### connect_image() -From `environments/text_2048/src/hud_controller/server.py`: +Connect to a Docker image via stdio: ```python -from hud.server import MCPServer -from .game import Game2048 -from .tools import MoveTool -from .setup import setup as setup_hub -from .evaluate import evaluate as evaluate_hub - -mcp = MCPServer(name="text-2048") -game = None - -@mcp.initialize -async def initialize_environment(ctx): - global game - - # Progress notifications - progress_token = getattr(ctx.meta, "progressToken", None) if ctx.meta else None - - async def send_progress(progress: int, message: str): - if progress_token: - await ctx.session.send_progress_notification( - progress_token=progress_token, - progress=progress, - total=100, - message=message - ) - - await send_progress(0, "Starting 2048 game environment...") - - # Create game - game = Game2048() - game.reset() - - await send_progress(50, "Setting up game board...") - - # Set game on hubs - setup_hub.env = game - evaluate_hub.env = game - - # Mount hubs - mcp.mount(setup_hub) - mcp.mount(evaluate_hub) - - await send_progress(70, "Configuring tools...") - - # Add move tool - mcp.add_tool(MoveTool(env=game)) - - await send_progress(100, "2048 environment ready") +env.connect_image("mcp/fetch") ``` -### remote_browser Environment +## Tool Formatting -From `environments/remote_browser/src/hud_controller/server.py`: +Convert tools to provider-specific formats. 
+ +### OpenAI ```python -from hud.server import MCPServer -from hud.tools.computer import HudComputerTool, AnthropicComputerTool, OpenAIComputerTool -from .tools import PlaywrightToolWithMemory, BrowserExecutor -from .setup import setup as setup_hub -from .evaluate import evaluate as evaluate_hub -from .providers import get_provider - -mcp = MCPServer( - name="HUD Remote Browser Environment", - instructions="""Remote browser automation environment...""" +# Chat Completions API +tools = env.as_openai_chat_tools() +response = await client.chat.completions.create( + model="gpt-4o", + messages=messages, + tools=tools, ) -# Global state -browser_provider = None -playwright_tool = None - -@mcp.resource("telemetry://live") -async def get_telemetry_resource(): - """MCP resource with live browser status.""" - return { - "provider": os.getenv("BROWSER_PROVIDER", "unknown"), - "status": "running" if browser_provider else "stopped", - "live_url": browser_provider.get_live_view_url() if browser_provider else None, - "cdp_url": browser_provider.cdp_url if browser_provider else None - } +# Responses API +tools = env.as_openai_responses_tools() -@mcp.initialize -async def initialize_environment(ctx): - global browser_provider, playwright_tool - - # Get metadata - metadata = ctx.meta - progress_token = metadata.get("progressToken", None) - - # Initialize provider - provider_name = os.getenv("BROWSER_PROVIDER") - provider_class = get_provider(provider_name) - browser_provider = provider_class(config) - - # Launch browser - cdp_url = await browser_provider.launch() - - # Create playwright tool - playwright_tool = PlaywrightToolWithMemory(cdp_url=cdp_url) - await playwright_tool._ensure_browser() - - # Add playwright tool (not a BaseTool, added directly) - mcp.add_tool(playwright_tool) - - # Create computer tools - executor = BrowserExecutor(playwright_tool) - tool_kwargs = {"executor": executor} - - # Add display dimensions from metadata - if metadata: - width = metadata.get("display_width") - height = metadata.get("display_height") - if width and height: - tool_kwargs["width"] = width - tool_kwargs["height"] = height - - # Add computer tools (all are BaseTool subclasses) - mcp.add_tool(HudComputerTool(**tool_kwargs)) - mcp.add_tool(AnthropicComputerTool(**tool_kwargs)) - mcp.add_tool(OpenAIComputerTool(**tool_kwargs)) - - # Mount hubs - setup_hub.env = playwright_tool - evaluate_hub.env = playwright_tool - mcp.mount(setup_hub) - mcp.mount(evaluate_hub) - -@mcp.shutdown -async def shutdown_environment(): - """Cleanup browser resources.""" - global browser_provider - if browser_provider: - browser_provider.close() - browser_provider = None +# Agents SDK (requires openai-agents) +tools = env.as_openai_agent_tools() ``` -## Standard Structure +### Anthropic/Claude -### Directory Layout - -``` -my-environment/ -โ”œโ”€โ”€ Dockerfile -โ”œโ”€โ”€ pyproject.toml -โ”œโ”€โ”€ controller/ # MCP controller (stdio) -โ”‚ โ”œโ”€โ”€ __init__.py # mcp = MCPServer() -โ”‚ โ”œโ”€โ”€ __main__.py # python -m controller โ†’ mcp.run() -โ”‚ โ”œโ”€โ”€ hooks.py # @mcp.initialize / @mcp.shutdown -โ”‚ โ””โ”€โ”€ tools.py # @mcp.tool(...) 
-โ””โ”€โ”€ environment/ # Optional backend (HTTP/IPC) - โ””โ”€โ”€ server.py # e.g., FastAPI app +```python +tools = env.as_claude_tools() +response = await client.messages.create( + model="claude-sonnet-4-5", + messages=messages, + tools=tools, +) ``` -### Dockerfile +### Gemini -```dockerfile -FROM python:3.11-slim +```python +tools = env.as_gemini_tools() +config = env.as_gemini_tool_config() +``` -WORKDIR /app +### LangChain -# Copy and install -COPY pyproject.toml ./ -COPY controller/ ./controller/ -COPY environment/ ./environment/ -RUN pip install --no-cache-dir -e . +```python +# Requires langchain-core +tools = env.as_langchain_tools() +``` -ENV ENV_SERVER_PORT=8005 +### LlamaIndex -# Start optional backend, then MCP controller on stdio -CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --log-level warning & python -m controller"] +```python +# Requires llama-index-core +tools = env.as_llamaindex_tools() ``` -### Hub Module Pattern - -Example from text_2048: +### Google ADK ```python -# src/hud_controller/setup/__init__.py -from hud.tools.base import BaseHub +# Requires google-adk +tools = env.as_adk_tools() +``` -setup = BaseHub("setup") +## Calling Tools -# Import all setup functions to register them -from . import board +### call_tool() -__all__ = ["setup"] +Execute tools with auto-format detection: + +```python +# Simple call +result = await env.call_tool("my_tool", arg="value") -# src/hud_controller/setup/board.py -from . import setup +# From OpenAI tool call +result = await env.call_tool(response.choices[0].message.tool_calls[0]) -@setup.tool("board") -async def setup_board(board_size: int = 4): - """Initialize game board.""" - game = setup.env # Access environment from hub - game.reset(size=board_size) - return [TextContent(text=f"{board_size}x{board_size} game initialized")] +# From Claude tool use +result = await env.call_tool(response.content[0]) # tool_use block ``` -## Key Concepts +Returns result in matching format (OpenAI tool call โ†’ OpenAI tool message, etc.). -### Environment State +## Mock Mode -Three patterns for managing state: +Test without real connections: -1. **Global variables** (simple environments): - ```python - game = None - - @mcp.initialize - async def initialize_environment(ctx): - global game - game = Game2048() - ``` +```python +env.mock() # Enable mock mode -2. **Context class** (complex environments): - ```python - class EnvironmentContext: - def __init__(self): - self.browser = None - self.page = None - - env = EnvironmentContext() - ``` +# Set specific mock outputs +env.mock_tool("navigate", "Navigation successful") +env.mock_tool("screenshot", b"fake_image_data") -3. **Hub env attribute** (for tool access): - ```python - setup_hub.env = game # Tools access via hub.env - ``` +async with env: + result = await env.call_tool("navigate", url="https://example.com") + # Returns "Navigation successful" instead of actually navigating -### Tool Lifecycle +env.unmock() # Disable mock mode +``` -1. **Setup tools** - Hidden from agents, prepare environment state -2. **Interaction tools** - Available to agents for control -3. 
**Evaluate tools** - Hidden from agents, score performance +| Method | Description | +|--------|-------------| +| `mock(enable=True)` | Enable/disable mock mode | +| `unmock()` | Disable mock mode | +| `mock_tool(name, output)` | Set specific mock output | +| `is_mock` | Check if mock mode is enabled | -### Progress Notifications +## Properties -Send [progress updates](https://modelcontextprotocol.io/specification/basic/utilities/progress) during long-running operations: +| Property | Type | Description | +|----------|------|-------------| +| `name` | `str` | Environment name | +| `prompt` | `str \| None` | Default prompt (set by scenarios or agent code) | +| `is_connected` | `bool` | True if in context | +| `connections` | `dict[str, Connector]` | Active connections | -```python -async def send_progress(progress: int, message: str): - if progress_token: - await ctx.session.send_progress_notification( - progress_token=progress_token, - progress=progress, - total=100, - message=message - ) -``` +## Creating Tasks - -Progress notifications follow the [MCP progress specification](https://modelcontextprotocol.io/specification/basic/utilities/progress#progress-flow). The `progressToken` comes from the client's request [metadata](https://modelcontextprotocol.io/specification/basic/index#_meta). - - -### Metadata Access - -Agent metadata flows through initialization: +Call the environment to create a Task: ```python -@mcp.initialize -async def initialize_environment(ctx): - # From agent's metadata class variable - width = ctx.meta.get("display_width", 1920) if ctx.meta else 1920 - height = ctx.meta.get("display_height", 1080) if ctx.meta else 1080 -``` +# With scenario +task = env("checkout", product="laptop") -## Testing +# Without scenario (just the environment) +task = env() +``` -```bash -# CLI testing -hud debug my-env:latest -hud analyze my-env:latest +Then run with `hud.eval()`: -# Python testing -async def test(): - from hud.clients import MCPClient - - client = MCPClient({ - "env": {"command": "docker", "args": ["run", "-i", "my-env"]} - }) - - async with client: - tools = await client.list_tools() - result = await client.call_tool("setup", {"value": 0}) +```python +async with hud.eval(task, variants={"model": ["gpt-4o"]}) as ctx: + ... ``` ## See Also -- [Build Environments](/build-environments) - Getting started guide -- [Tools](/reference/tools) - Tool implementation reference -- [Environment Spec](/build-environments/spec) - Technical specification and architecture \ No newline at end of file +- [Evals](/reference/evals) - hud.eval() reference +- [MCPServer](/reference/mcpserver) - Building MCP servers +- [Environments Guide](/quick-links/environments) - Getting started guide + diff --git a/docs/reference/evals.mdx b/docs/reference/evals.mdx new file mode 100644 index 00000000..425e461e --- /dev/null +++ b/docs/reference/evals.mdx @@ -0,0 +1,208 @@ +--- +title: "Evals" +description: "SDK reference for hud.eval() - the unified evaluation context manager" +icon: "flask-vial" +--- + +`hud.eval()` is the primary way to run evaluations. It creates an `EvalContext` with telemetry, handles parallel execution, and integrates with the HUD platform. + +## hud.eval() + +```python +import hud + +async with hud.eval() as ctx: + # ctx is an EvalContext (extends Environment) + response = await client.chat.completions.create(...) 
+ ctx.reward = 1.0 +``` + +### Parameters + +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `source` | `Task \| list[Task] \| str \| None` | Task objects from `env()`, task slugs, or None | `None` | +| `variants` | `dict[str, Any] \| None` | A/B test configuration (lists expand to combinations) | `None` | +| `group` | `int` | Runs per variant for statistical significance | `1` | +| `group_ids` | `list[str] \| None` | Custom group IDs for parallel runs | `None` | +| `job_id` | `str \| None` | Job ID to link traces to | `None` | +| `api_key` | `str \| None` | API key for backend calls | `None` | +| `max_concurrent` | `int \| None` | Maximum concurrent evaluations | `None` | +| `trace` | `bool` | Send telemetry to backend | `True` | +| `quiet` | `bool` | Suppress console output | `False` | + +### Source Types + +The `source` parameter accepts: + +```python +# 1. Blank eval - manual setup and reward +async with hud.eval() as ctx: + ctx.reward = compute_reward() + +# 2. Task from Environment (recommended) +env = Environment("my-env") +task = env("checkout", product="laptop") # Creates Task from scenario +async with hud.eval(task) as ctx: + await agent.run(ctx.prompt) + +# 3. Task slug (loads from platform) +async with hud.eval("my-org/browser-task") as ctx: + await agent.run(ctx) + +# 4. Multiple tasks +tasks = [env("checkout", product="laptop"), env("checkout", product="phone")] +async with hud.eval(tasks) as ctx: + await agent.run(ctx.prompt) +``` + +### Variants + +Test multiple configurations in parallel: + +```python +async with hud.eval( + eval, + variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, +) as ctx: + model = ctx.variants["model"] # Current variant + response = await client.chat.completions.create(model=model, ...) +``` + +Lists expand to all combinations: + +```python +variants = { + "model": ["gpt-4o", "claude"], + "temperature": [0.0, 0.7], +} +# Creates 4 combinations: gpt-4o+0.0, gpt-4o+0.7, claude+0.0, claude+0.7 +``` + +### Groups + +Run each variant multiple times for statistical significance: + +```python +async with hud.eval(eval, variants={"model": ["gpt-4o"]}, group=5) as ctx: + # Runs 5 times - see the distribution of results + ... +``` + +Total runs = `len(evals) ร— len(variant_combinations) ร— group` + +### Concurrency Control + +```python +async with hud.eval( + evals, + max_concurrent=10, # Max 10 parallel evaluations +) as ctx: + ... +``` + +## EvalContext + +`EvalContext` extends `Environment` with evaluation tracking. 
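+
+A quick sketch (assuming `task` came from `env(...)` and that a `search` tool is registered on that environment) showing how the context exposes both the environment's tools and the evaluation state:
+
+```python
+async with hud.eval(task) as ctx:
+    # Environment side: tool schemas and tool calls
+    tools = ctx.as_openai_chat_tools()
+    result = await ctx.call_tool("search", query="test")
+
+    # Evaluation side: telemetry and scoring
+    print(ctx.trace_id, ctx.variants)
+    ctx.reward = 1.0
+```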
+ +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `trace_id` | `str` | Unique trace identifier | +| `eval_name` | `str` | Evaluation name | +| `prompt` | `str \| None` | Task prompt (from scenario or task) | +| `variants` | `dict[str, Any]` | Current variant assignment | +| `reward` | `float \| None` | Evaluation reward (settable) | +| `answer` | `str \| None` | Submitted answer | +| `error` | `BaseException \| None` | Error if failed | +| `results` | `list[EvalContext]` | Results from parallel runs | +| `headers` | `dict[str, str]` | Trace headers for HTTP requests | +| `job_id` | `str \| None` | Parent job ID | +| `group_id` | `str \| None` | Group ID for parallel runs | +| `index` | `int` | Index in parallel execution | + +### Methods + +All `Environment` methods are available, plus: + +```python +# Submit answer (passes to scenario for evaluation) +await ctx.submit(answer) + +# Set reward directly +ctx.reward = 1.0 + +# Access tools in provider formats +tools = ctx.as_openai_chat_tools() + +# Call tools +result = await ctx.call_tool("my_tool", arg="value") +``` + +### Headers for Telemetry + +Inside an eval context, trace headers are automatically injected into HTTP requests: + +```python +async with hud.eval() as ctx: + # Requests to HUD services include Trace-Id automatically + response = await client.chat.completions.create(...) + + # Manual access + print(ctx.headers) # {"Trace-Id": "..."} +``` + +## Working with Environments + +The recommended pattern is to create Evals from an Environment: + +```python +from hud import Environment +import hud + +env = Environment("my-env") + +@env.tool() +def count_letter(text: str, letter: str) -> int: + return text.lower().count(letter.lower()) + +@env.scenario("count") +async def count_scenario(sentence: str, letter: str): + answer = yield f"How many '{letter}' in '{sentence}'?" + correct = str(sentence.lower().count(letter.lower())) + yield correct in answer + +# Create a Task from the scenario +task = env("count", sentence="Strawberry", letter="r") + +# Run with variants +async with hud.eval(task, variants={"model": ["gpt-4o", "claude"]}) as ctx: + response = await client.chat.completions.create( + model=ctx.variants["model"], + messages=[{"role": "user", "content": ctx.prompt}], + tools=ctx.as_openai_chat_tools(), + ) + await ctx.submit(response.choices[0].message.content or "") +``` + +## Results + +After parallel runs complete, access results on the summary context: + +```python +async with hud.eval(eval, variants={"model": ["gpt-4o", "claude"]}, group=3) as ctx: + ... + +# ctx.results contains all individual EvalContexts +for result in ctx.results: + print(f"{result.variants}: reward={result.reward}, answer={result.answer}") +``` + +## See Also + +- [Environments](/reference/environments) - Environment class reference +- [A/B Evals](/quick-links/ab-testing) - Variants and groups guide +- [Deploy](/quick-links/deploy) - Running evals at scale +- [`hud eval` CLI](/reference/cli/eval) - Command-line interface + diff --git a/docs/reference/mcpserver.mdx b/docs/reference/mcpserver.mdx new file mode 100644 index 00000000..42d33e2b --- /dev/null +++ b/docs/reference/mcpserver.mdx @@ -0,0 +1,510 @@ +--- +title: "MCPServer" +description: "SDK reference for building MCP servers" +icon: "server" +--- + +`MCPServer` is the base class for building MCP-compatible servers that work with any MCP client. It extends FastMCP with Docker-friendly features. + +## Why MCP? 
+ +Traditional agent frameworks couple agents tightly to specific environments. MCP decouples them: + + + + - Agent code hardcoded for each environment + - No standardization across tools + - Difficult to swap agents or environments + + + + - Any agent works with any environment + - Standard protocol for all interactions + - Easy to swap components + + + +MCP standardizes agent-environment communication through JSON-RPC messages. Agents call tools exposed by servers and receive structured responses. + +## MCPServer + +```python +from hud.server import MCPServer +``` + +Enhanced FastMCP server with Docker-friendly features for building HUD environments. + +**Constructor Parameters:** +| Parameter | Type | Description | Default | +|-----------|------|-------------|---------| +| `name` | `str` | Server name for MCP handshake | Required | +| `instructions` | `str` | Server instructions/description | `None` | +| `**fastmcp_kwargs` | `Any` | Additional FastMCP parameters | - | + +**Key Features:** +1. **SIGTERM handling** - Graceful shutdown in containers via custom runner +2. **Initialize decorator** - Async setup during MCP initialize request (stdout is temporarily redirected to stderr during initialization to avoid corrupting MCP output) +3. **Shutdown decorator** - Runs only on SIGTERM (container termination), not on hotโ€‘reload/SIGINT +4. **Enhanced add_tool()** - Automatically handles `BaseTool` instances and raw FastMCP Tool objects +5. **Tool decorator passthrough** - `@mcp.tool` returns the original function for easy composition +6. **FastMCP inheritance** - All FastMCP methods available (`mount`, `resource`, `tool`) + +### Decorators + +#### @initialize + +Run async setup during MCP initialize request: + +```python +mcp = MCPServer(name="my-env") + +@mcp.initialize +async def setup_environment(ctx): + """ + Initialize environment resources. + + Args: + ctx: RequestContext with: + - ctx.meta: Client metadata dict + - ctx.session: MCP ServerSession + """ + # Access metadata from agent (if provided) + if ctx.meta: + progress_token = ctx.meta.get("progressToken") + display_width = ctx.meta.get("display_width", 1920) + display_height = ctx.meta.get("display_height", 1080) + + # Send progress notifications + if progress_token: + await ctx.session.send_progress_notification( + progress_token=progress_token, + progress=50, + total=100, + message="Initializing environment..." + ) +``` + +#### @shutdown + +Run cleanup on SIGTERM (container termination only): + +```python +@mcp.shutdown +async def cleanup(): + """Clean up resources on shutdown.""" + if browser_provider: + browser_provider.close() + logger.info("Cleanup complete") +``` + +### Tool Registration + +Three ways to register tools: + +```python +# 1. Decorator for simple functions +@mcp.tool() +async def my_tool(param: str) -> dict: + return {"result": param} + +# 2. Add BaseTool instances +from hud.tools import BashTool +bash = BashTool() +mcp.add_tool(bash) # Automatically uses bash.mcp internally + +# 3. 
Add non-BaseTool instances directly +from custom import PlaywrightTool +playwright = PlaywrightTool() +mcp.add_tool(playwright) # Added as-is +``` + +### Hub Pattern (mount) + +Use BaseHub for organized tool namespaces: + +```python +from hud.tools import BaseHub + +# Create hub +setup_hub = BaseHub("setup") + +# Add internal tools (hidden from agents) +@setup_hub.tool("board") +async def setup_board(size: int = 4): + game = setup_hub.env + game.reset(size=size) + return [TextContent(text=f"{size}x{size} board initialized")] + +# Mount hub on server +mcp.mount(setup_hub) + +# Agents call via dispatcher: setup(name="board", arguments={"size": 4}) +``` + +### Resources + +Expose metadata via MCP resources: + +```python +@mcp.resource("telemetry://live") +async def get_telemetry(): + """Expose live telemetry data.""" + return { + "provider": os.getenv("BROWSER_PROVIDER"), + "status": "running" if browser_provider else "stopped", + "live_url": browser_provider.get_live_view_url() if browser_provider else None, + "timestamp": datetime.now().isoformat() + } +``` + +### Running the Server + +```python +if __name__ == "__main__": + # Run with SIGTERM handling (stdio by default) + mcp.run() + + # Or use development transports (HTTP/SSE) + mcp.run(transport="http", port=8765) + mcp.run(transport="sse", port=8080) +``` + +When using HTTP/SSE, HUD development helper endpoints are available: + +- `GET /hud` โ€“ overview +- `GET /hud/tools` โ€“ list tools with schemas +- `GET /hud/resources` โ€“ list resources +- `GET /hud/prompts` โ€“ list prompts + +## Real Environment Examples + +### Minimal Environment + +```python +# src/hud_controller/server.py +from hud.server import MCPServer +from mcp.types import TextContent + +mcp = MCPServer(name="counter-env") +counter = {"value": 0} + +@mcp.tool() +async def setup(start_value: int = 0): + """Initialize counter.""" + counter["value"] = start_value + return {"status": "ready", "counter": counter["value"]} + +@mcp.tool() +async def increment(): + """Increment counter.""" + counter["value"] += 1 + return [TextContent(text=f"Counter: {counter['value']}", type="text")] + +@mcp.tool() +async def evaluate(target: int): + """Check if target reached.""" + from hud.tools.types import EvaluationResult + return EvaluationResult( + reward=1.0 if counter["value"] >= target else 0.0, + done=counter["value"] >= target + ) + +if __name__ == "__main__": + mcp.run() +``` + +### text_2048 Environment + +From `environments/text_2048/src/hud_controller/server.py`: + +```python +from hud.server import MCPServer +from .game import Game2048 +from .tools import MoveTool +from .setup import setup as setup_hub +from .evaluate import evaluate as evaluate_hub + +mcp = MCPServer(name="text-2048") +game = None + +@mcp.initialize +async def initialize_environment(ctx): + global game + + # Progress notifications + progress_token = getattr(ctx.meta, "progressToken", None) if ctx.meta else None + + async def send_progress(progress: int, message: str): + if progress_token: + await ctx.session.send_progress_notification( + progress_token=progress_token, + progress=progress, + total=100, + message=message + ) + + await send_progress(0, "Starting 2048 game environment...") + + # Create game + game = Game2048() + game.reset() + + await send_progress(50, "Setting up game board...") + + # Set game on hubs + setup_hub.env = game + evaluate_hub.env = game + + # Mount hubs + mcp.mount(setup_hub) + mcp.mount(evaluate_hub) + + await send_progress(70, "Configuring tools...") + + # Add move tool + 
mcp.add_tool(MoveTool(env=game)) + + await send_progress(100, "2048 environment ready") +``` + +### remote_browser Environment + +From `environments/remote_browser/src/hud_controller/server.py`: + +```python +from hud.server import MCPServer +from hud.tools.computer import HudComputerTool, AnthropicComputerTool, OpenAIComputerTool +from .tools import PlaywrightToolWithMemory, BrowserExecutor +from .setup import setup as setup_hub +from .evaluate import evaluate as evaluate_hub +from .providers import get_provider + +mcp = MCPServer( + name="HUD Remote Browser Environment", + instructions="""Remote browser automation environment...""" +) + +# Global state +browser_provider = None +playwright_tool = None + +@mcp.resource("telemetry://live") +async def get_telemetry_resource(): + """MCP resource with live browser status.""" + return { + "provider": os.getenv("BROWSER_PROVIDER", "unknown"), + "status": "running" if browser_provider else "stopped", + "live_url": browser_provider.get_live_view_url() if browser_provider else None, + "cdp_url": browser_provider.cdp_url if browser_provider else None + } + +@mcp.initialize +async def initialize_environment(ctx): + global browser_provider, playwright_tool + + # Get metadata + metadata = ctx.meta + progress_token = metadata.get("progressToken", None) + + # Initialize provider + provider_name = os.getenv("BROWSER_PROVIDER") + provider_class = get_provider(provider_name) + browser_provider = provider_class(config) + + # Launch browser + cdp_url = await browser_provider.launch() + + # Create playwright tool + playwright_tool = PlaywrightToolWithMemory(cdp_url=cdp_url) + await playwright_tool._ensure_browser() + + # Add playwright tool (not a BaseTool, added directly) + mcp.add_tool(playwright_tool) + + # Create computer tools + executor = BrowserExecutor(playwright_tool) + tool_kwargs = {"executor": executor} + + # Add display dimensions from metadata + if metadata: + width = metadata.get("display_width") + height = metadata.get("display_height") + if width and height: + tool_kwargs["width"] = width + tool_kwargs["height"] = height + + # Add computer tools (all are BaseTool subclasses) + mcp.add_tool(HudComputerTool(**tool_kwargs)) + mcp.add_tool(AnthropicComputerTool(**tool_kwargs)) + mcp.add_tool(OpenAIComputerTool(**tool_kwargs)) + + # Mount hubs + setup_hub.env = playwright_tool + evaluate_hub.env = playwright_tool + mcp.mount(setup_hub) + mcp.mount(evaluate_hub) + +@mcp.shutdown +async def shutdown_environment(): + """Cleanup browser resources.""" + global browser_provider + if browser_provider: + browser_provider.close() + browser_provider = None +``` + +## Standard Structure + +### Directory Layout + +``` +my-environment/ +โ”œโ”€โ”€ Dockerfile +โ”œโ”€โ”€ pyproject.toml +โ”œโ”€โ”€ controller/ # MCP controller (stdio) +โ”‚ โ”œโ”€โ”€ __init__.py # mcp = MCPServer() +โ”‚ โ”œโ”€โ”€ __main__.py # python -m controller โ†’ mcp.run() +โ”‚ โ”œโ”€โ”€ hooks.py # @mcp.initialize / @mcp.shutdown +โ”‚ โ””โ”€โ”€ tools.py # @mcp.tool(...) +โ””โ”€โ”€ environment/ # Optional backend (HTTP/IPC) + โ””โ”€โ”€ server.py # e.g., FastAPI app +``` + +### Dockerfile + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# Copy and install +COPY pyproject.toml ./ +COPY controller/ ./controller/ +COPY environment/ ./environment/ +RUN pip install --no-cache-dir -e . 
+ +ENV ENV_SERVER_PORT=8005 + +# Start optional backend, then MCP controller on stdio +CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --log-level warning & python -m controller"] +``` + +### Hub Module Pattern + +Example from text_2048: + +```python +# src/hud_controller/setup/__init__.py +from hud.tools.base import BaseHub + +setup = BaseHub("setup") + +# Import all setup functions to register them +from . import board + +__all__ = ["setup"] + +# src/hud_controller/setup/board.py +from . import setup + +@setup.tool("board") +async def setup_board(board_size: int = 4): + """Initialize game board.""" + game = setup.env # Access environment from hub + game.reset(size=board_size) + return [TextContent(text=f"{board_size}x{board_size} game initialized")] +``` + +## Key Concepts + +### Environment State + +Three patterns for managing state: + +1. **Global variables** (simple environments): + ```python + game = None + + @mcp.initialize + async def initialize_environment(ctx): + global game + game = Game2048() + ``` + +2. **Context class** (complex environments): + ```python + class EnvironmentContext: + def __init__(self): + self.browser = None + self.page = None + + env = EnvironmentContext() + ``` + +3. **Hub env attribute** (for tool access): + ```python + setup_hub.env = game # Tools access via hub.env + ``` + +### Tool Lifecycle + +1. **Setup tools** - Hidden from agents, prepare environment state +2. **Interaction tools** - Available to agents for control +3. **Evaluate tools** - Hidden from agents, score performance + +### Progress Notifications + +Send [progress updates](https://modelcontextprotocol.io/specification/basic/utilities/progress) during long-running operations: + +```python +async def send_progress(progress: int, message: str): + if progress_token: + await ctx.session.send_progress_notification( + progress_token=progress_token, + progress=progress, + total=100, + message=message + ) +``` + + +Progress notifications follow the [MCP progress specification](https://modelcontextprotocol.io/specification/basic/utilities/progress#progress-flow). The `progressToken` comes from the client's request [metadata](https://modelcontextprotocol.io/specification/basic/index#_meta). 
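+
+Under the hood, each `send_progress_notification` call is delivered as an MCP `notifications/progress` message. Roughly, the JSON-RPC payload looks like this (the token and values are illustrative):
+
+```json
+{
+  "jsonrpc": "2.0",
+  "method": "notifications/progress",
+  "params": {
+    "progressToken": "init-1",
+    "progress": 50,
+    "total": 100,
+    "message": "Initializing environment..."
+  }
+}
+```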
+ + +### Metadata Access + +Agent metadata flows through initialization: + +```python +@mcp.initialize +async def initialize_environment(ctx): + # From agent's metadata class variable + width = ctx.meta.get("display_width", 1920) if ctx.meta else 1920 + height = ctx.meta.get("display_height", 1080) if ctx.meta else 1080 +``` + +## Testing + +```bash +# CLI testing +hud debug my-env:latest +hud analyze my-env:latest + +# Python testing +async def test(): + from hud.clients import MCPClient + + client = MCPClient({ + "env": {"command": "docker", "args": ["run", "-i", "my-env"]} + }) + + async with client: + tools = await client.list_tools() + result = await client.call_tool("setup", {"value": 0}) +``` + +## See Also + +- [Environments](/reference/environments) - Environment class (client-side) +- [Tools](/reference/tools) - Tool implementation reference +- [Evals](/reference/evals) - Running evaluations \ No newline at end of file diff --git a/docs/reference/tasks.mdx b/docs/reference/tasks.mdx index 0bd6d76a..44f93138 100644 --- a/docs/reference/tasks.mdx +++ b/docs/reference/tasks.mdx @@ -4,12 +4,16 @@ description: "SDK reference for task configuration and dataset utilities" icon: "list-check" --- -The HUD SDK provides the `Task` class for defining agent objectives and dataset utilities for managing task collections. +The HUD SDK provides the `LegacyTask` class for defining agent objectives and dataset utilities for managing task collections. -## Task Class + +`LegacyTask` is deprecated. For new code, use `env("scenario_name", **args)` to create Task objects. See [Environments](/reference/environments) for the recommended approach. + + +## LegacyTask Class ```python -from hud.datasets import Task +from hud.datasets import LegacyTask ``` Pydantic model that defines an agent's objective, setup, and evaluation criteria. @@ -31,7 +35,7 @@ Pydantic model that defines an agent's objective, setup, and evaluation criteria The `mcp_config` field automatically resolves environment variables using `${VAR_NAME}` syntax: ```python -task = Task( +task = LegacyTask( prompt="Navigate to the dashboard", mcp_config={ "browser": { @@ -45,7 +49,7 @@ task = Task( ) ``` -Variables are resolved when Task is created from a dict - this is why datasets should store raw dictionaries. +Variables are resolved when LegacyTask is created from a dict - this is why datasets should store raw dictionaries. ## Running Tasks @@ -208,7 +212,7 @@ The `agent_config` field on tasks supports: | `initial_screenshot` | `bool` | Take screenshot before first action | ```python -task = Task( +task = LegacyTask( prompt="Complete the form", mcp_config={...}, agent_config={ diff --git a/docs/reference/types.mdx b/docs/reference/types.mdx index 8361353a..57f8cdb5 100644 --- a/docs/reference/types.mdx +++ b/docs/reference/types.mdx @@ -6,133 +6,126 @@ icon: "code" Core types used throughout the HUD SDK. -## Trace +## Task -Returned by `agent.run()`. Contains the result of an agent execution. +Created by calling an Environment. Holds configuration for running an evaluation. 
```python -from hud.types import Trace +from hud import Environment + +env = Environment("my-env") +task = env("scenario_name", arg1="value") # Returns Task ``` | Field | Type | Description | |-------|------|-------------| -| `reward` | `float` | Evaluation score (0.0-1.0) | -| `done` | `bool` | Whether execution completed | -| `content` | `str \| None` | Final response content | -| `isError` | `bool` | Whether an error occurred | -| `info` | `dict[str, Any]` | Additional metadata | -| `task` | `Task \| None` | The executed task | -| `trace` | `list[TraceStep]` | Execution trace steps | -| `messages` | `list[Any]` | Final conversation state | +| `env` | `Environment \| dict \| None` | Source environment | +| `scenario` | `str \| None` | Scenario name to run | +| `args` | `dict[str, Any]` | Script arguments | +| `trace_id` | `str \| None` | Trace identifier | +| `job_id` | `str \| None` | Parent job ID | +| `group_id` | `str \| None` | Group ID for parallel runs | +| `index` | `int` | Index in parallel execution | +| `variants` | `dict[str, Any] \| None` | Variant assignment | -## AgentResponse +## EvalContext -Returned by agent `get_response()` methods. Represents a single model response. +Returned by `hud.eval()`. Extends Environment with evaluation tracking. ```python -from hud.types import AgentResponse +async with hud.eval(task) as ctx: + print(ctx.prompt) # Task prompt + print(ctx.variants) # Current variant + ctx.reward = 1.0 # Set reward ``` -| Field | Type | Description | -|-------|------|-------------| -| `tool_calls` | `list[MCPToolCall]` | Tools to execute | -| `done` | `bool` | Whether agent should stop | -| `content` | `str \| None` | Response text | -| `reasoning` | `str \| None` | Model reasoning/thinking | -| `info` | `dict[str, Any]` | Provider-specific metadata | -| `isError` | `bool` | Error flag | +| Property | Type | Description | +|----------|------|-------------| +| `trace_id` | `str` | Unique trace identifier | +| `eval_name` | `str` | Evaluation name | +| `prompt` | `str \| None` | Task prompt | +| `variants` | `dict[str, Any]` | Current variant assignment | +| `reward` | `float \| None` | Evaluation reward | +| `answer` | `str \| None` | Submitted answer | +| `error` | `BaseException \| None` | Error if failed | +| `results` | `list[EvalContext]` | Results from parallel runs | +| `headers` | `dict[str, str]` | Trace headers | ## MCPToolCall -Represents a tool call to be executed. +Represents a tool call to execute. ```python from hud.types import MCPToolCall + +call = MCPToolCall( + name="navigate", + arguments={"url": "https://example.com"} +) ``` | Field | Type | Description | |-------|------|-------------| -| `id` | `str` | Unique identifier (auto-generated if not provided) | +| `id` | `str` | Unique identifier (auto-generated) | | `name` | `str` | Tool name | | `arguments` | `dict[str, Any]` | Tool arguments | -**Example:** - -```python -tool_call = MCPToolCall( - name="playwright", - arguments={"action": "click", "selector": "#submit"} -) -``` - ## MCPToolResult Result from executing a tool call. 
```python from hud.types import MCPToolResult + +result = MCPToolResult( + content=[TextContent(text="Success", type="text")], + isError=False +) ``` | Field | Type | Description | |-------|------|-------------| | `content` | `list[ContentBlock]` | Result content blocks | -| `structuredContent` | `dict[str, Any] \| None` | Structured result data | -| `isError` | `bool` | Whether the tool call failed | +| `structuredContent` | `dict \| None` | Structured result data | +| `isError` | `bool` | Whether the call failed | -## Task +## Trace -Defines an agent task with prompt, environment config, and lifecycle tools. +Returned by `agent.run()`. Contains the result of an agent execution. ```python -from hud.types import Task +from hud.types import Trace + +result = await agent.run(task, max_steps=20) +print(result.reward, result.done) ``` | Field | Type | Description | |-------|------|-------------| -| `prompt` | `str` | Instruction for the agent | -| `mcp_config` | `dict` | Environment connection config | -| `id` | `str \| None` | Unique identifier (required for datasets) | -| `system_prompt` | `str \| None` | Custom system prompt | -| `setup_tool` | `dict \| list[dict] \| None` | Tool(s) to initialize state | -| `evaluate_tool` | `dict \| list[dict] \| None` | Tool(s) to score performance | -| `agent_config` | `BaseAgentConfig \| None` | Task-specific agent config | -| `metadata` | `dict \| None` | Additional task metadata | - -**Example:** - -```python -task = Task( - prompt="Navigate to example.com and click login", - mcp_config={ - "hud": { - "url": "https://mcp.hud.ai/v3/mcp", - "headers": { - "Authorization": "Bearer ${HUD_API_KEY}", - "Mcp-Image": "hudpython/hud-remote-browser:latest" - } - } - }, - setup_tool={"name": "playwright", "arguments": {"action": "navigate", "url": "https://example.com"}}, - evaluate_tool={"name": "evaluate", "arguments": {"name": "url_contains", "substring": "/login"}} -) -``` +| `reward` | `float` | Evaluation score (0.0-1.0) | +| `done` | `bool` | Whether execution completed | +| `content` | `str \| None` | Final response content | +| `isError` | `bool` | Whether an error occurred | +| `info` | `dict[str, Any]` | Additional metadata | +| `trace` | `list[TraceStep]` | Execution trace steps | +| `messages` | `list[Any]` | Final conversation state | -## BaseAgentConfig +## AgentResponse -Standard agent configuration that tasks can override. +Returned by agent `get_response()` methods. ```python -from hud.types import BaseAgentConfig +from hud.types import AgentResponse ``` -| Field | Type | Description | Default | -|-------|------|-------------|---------| -| `allowed_tools` | `list[str] \| None` | Tool patterns to expose | `None` (all) | -| `disallowed_tools` | `list[str] \| None` | Tool patterns to hide | `None` | -| `system_prompt` | `str \| None` | Custom system prompt | `None` | -| `append_setup_output` | `bool` | Include setup output in first turn | `True` | -| `initial_screenshot` | `bool` | Include screenshot in initial context | `True` | -| `response_tool_name` | `str \| None` | Lifecycle tool for responses | `None` | +| Field | Type | Description | +|-------|------|-------------| +| `tool_calls` | `list[MCPToolCall]` | Tools to execute | +| `done` | `bool` | Whether agent should stop | +| `content` | `str \| None` | Response text | +| `reasoning` | `str \| None` | Model reasoning/thinking | +| `info` | `dict[str, Any]` | Provider-specific metadata | +| `isError` | `bool` | Error flag | ## AgentType @@ -140,6 +133,9 @@ Enum of supported agent types. 
```python from hud.types import AgentType + +agent_cls = AgentType.CLAUDE.cls +agent = agent_cls.create() ``` | Value | Agent Class | @@ -150,25 +146,44 @@ from hud.types import AgentType | `AgentType.GEMINI` | `GeminiAgent` | | `AgentType.OPENAI_COMPATIBLE` | `OpenAIChatAgent` | -**Example:** +## ContentBlock + +MCP content types (from `mcp.types`): ```python -from hud.types import AgentType +from mcp.types import TextContent, ImageContent -agent_cls = AgentType.CLAUDE.cls # Returns ClaudeAgent class -agent = agent_cls.create() +# Text +TextContent(text="Hello", type="text") + +# Image +ImageContent(data="base64...", mimeType="image/png", type="image") ``` -## ContentBlock +## EvaluationResult -MCP content block types (from `mcp.types`): +Returned by evaluation tools. -- `TextContent` - Text content with `text` field -- `ImageContent` - Image with `data` (base64) and `mimeType` -- `EmbeddedResource` - Embedded resource reference +```python +from hud.tools.types import EvaluationResult -## See Also +result = EvaluationResult( + reward=0.8, + done=True, + content="Task completed", + info={"score": 80} +) +``` -- [Agents Reference](/reference/agents) - Agent classes and configuration -- [Tasks Reference](/reference/tasks) - Task configuration details +| Field | Type | Description | +|-------|------|-------------| +| `reward` | `float` | Score (0.0-1.0) | +| `done` | `bool` | Task complete | +| `content` | `str \| None` | Details | +| `info` | `dict` | Metadata | + +## See Also +- [Evals](/reference/evals) - hud.eval() reference +- [Environments](/reference/environments) - Environment class +- [Agents](/reference/agents) - Agent classes diff --git a/docs/train-agents/quickstart.mdx b/docs/train-agents/quickstart.mdx deleted file mode 100644 index 32e83471..00000000 --- a/docs/train-agents/quickstart.mdx +++ /dev/null @@ -1,126 +0,0 @@ ---- -title: "RL Quickstart" -icon: "graduation-cap" ---- - -## Prerequisites - -- HUD API key: Remote training requires authentication. Set `HUD_API_KEY` before running: - -```bash -export HUD_API_KEY="sk-hud-..." # get one at https://hud.ai -# Or persist it locally: -hud set HUD_API_KEY=sk-hud-... -``` - -- Docker daemon: For local runs (using `--local`) or when training against a local Docker image, ensure Docker Desktop is installed and the Docker daemon is running. - -## Quickstart - -Install and download a taskset: - -```bash -uv tool install hud-python@latest --python 3.12 -hud get hud-evals/2048-basic -``` - -### 1) Simple: Train (remote by default) - -```bash -hud rl 2048-basic.json -``` - -This launches training remotely and automatically provisions a vLLM server and a trainer for you. You can monitor progress on https://hud.ai. The server persists between runs, so you can rerun training or evaluate against the same endpoint. - -Optional baseline first (Claude or Operator): - -```bash -hud eval 2048-basic.json -``` - -### 2) Run on your own machine/remote - -Use any provider with at least 2 GPUs (one for inference, one for training). Run locally with the flag `--local`: - -```bash -uv tool install hud-python@latest --python 3.12 -hud get hud-evals/2048-basic -hud rl 2048-basic.json --local -``` - -### Recommended setups - -- 2ร— A100: quick iteration, shorter runs -- 8ร— A100: higher throughput for larger tasksets - -Training throughput depends on task complexity and parallelism (`max_parallel_episodes`). 
-
-### 3) Build your own environment (hud init)
-
-Create a new MCP environment, develop with hot-reload, and train on a production image:
-
-```bash
-hud init my-env && cd my-env
-hud dev --interactive
-# When ready to run:
-hud rl
-```
-
-Change the tasks.json to include other tasks you want to train on.
-
-See [hud init](/reference/cli/init) for options and details.
-
-
-## Getting the best performance
-
-Training a good model often requires many iterations over the trainer's parameters. Take the config generated by `hud rl` and vary its values to run a hyperparameter sweep.
-
-For easy launching, specify the tasks and config upfront, and add `--yes` to automatically launch vLLM and training.
-
-```bash
-hud rl taskset.json --config rl-config.json --yes
-```
-
-Additionally, it can be helpful to run an initial analysis on the dataset to determine which tasks would be the most informative to train on. In that case, either start with a deployed model or run `hud rl` without training, and then:
-
-```bash
-hud eval taskset.json --full --group-size 6 --max-steps 5
-```
-
-This will prompt you for the model choice and produce a table of accuracies per task. Prefer tasks which are 10%-60% accurate for training.
-
-Some general findings from our internal training runs:
-- As many different tasks per gradient update as possible (runs with 4+ GPUs and batch size of 50+ are much more stable than single GPU runs)
-- Batch size should be somewhere around 2/X, where X is the accuracy of that given task on an untrained model.
-
-### Pricing
-
-Below is the pricing by GPU type. Actual prices vary — see https://hud.ai/project/billing for current rates.
-
-vLLM GPU Pricing (2 Hosted GPUs)
-
-| GPU type | Memory | Est. price/hr |
-| --- | --- | --- |
-| A100 80GB | 80 GB | $4.95 |
-| H100 80GB | 80 GB | $7.95 |
-
-Training GPU Pricing
-
-| GPU type | Memory | Est. 
price/hr | -| --- | --- | --- | -| A100 80GB | 80 GB | $3.95 | -| H100 80GB | 80 GB | $5.40 | - ---- - -### Learn more - - - - Complete guide to building environments from scratch - - - - Full `hud rl` command options and usage - - \ No newline at end of file diff --git a/docs/train-agents/tasks.mdx b/docs/train-agents/tasks.mdx deleted file mode 100644 index 58131b6f..00000000 --- a/docs/train-agents/tasks.mdx +++ /dev/null @@ -1,80 +0,0 @@ ---- -title: Dataset Design -icon: table ---- - -## Tasks format - -HUD tasksets can be provided in two primary formats (both supported): - -1) A single JSON file containing a list of task objects (recommended) - -```json -[ - { - "id": "browser_2048_128", - "prompt": "Reach 128 in 2048.", - "mcp_config": { - "hud": { - "url": "https://mcp.hud.ai/v3/mcp", - "headers": { - "Authorization": "Bearer ${HUD_API_KEY}", - "Mcp-Image": "hudevals/hud-browser:0.1.3" - } - } - }, - "setup_tool": {"name": "launch_app", "arguments": {"app_name": "2048"}}, - "evaluate_tool": {"name": "evaluate", "arguments": {"name": "game_2048_max_number", "arguments": {"target": 128}}} - } -] -``` - -Save as `2048-basic.json` and run: - -```bash -hud eval 2048-basic.json -hud rl 2048-basic.json -``` - -2) JSONL file with one task object per line - -- prompt: instruction for the agent -- mcp_config: where to run the environment (local docker or remote MCP) -- setup_tool (optional): a tool call to prepare the environment -- evaluate_tool: a tool call to compute reward -- system_prompt (optional): extra guidance for the agent - -## Hosting on HuggingFace - -You can host tasksets on the Hub and fetch them with: - -```bash -hud get hud-evals/2048-basic -``` - -The command downloads the JSONL task file and places it in your project directory. - -This allows running the full dataset or training with simply: - -```bash -hud eval hud-evals/2048-basic -hud rl hud-evals/2048-basic -``` - -## Tips - -- Keep tasks self-contained; use `setup_tool` to open apps or load data -- Ensure `evaluate_tool` returns a numeric reward per episode -- Use small task counts to iterate quickly; scale up once stable - - - - Learn how to run benchmarks - - - - Deep-dive into MCP configs and tools - - - - diff --git a/environments/README.md b/environments/README.md deleted file mode 100644 index 40cba300..00000000 --- a/environments/README.md +++ /dev/null @@ -1,956 +0,0 @@ -# How to Build HUD-Compatible MCP Environments - -This document is a step-by-step guide for turning *any* piece of software that can run in a Docker container into a **Model Context Protocol (MCP)** environment that the HUD SDK can evaluate or control. Weโ€™ll move through six short phases, each with a clear checkpoint. - -> **Big picture** -> โ€ข An *agent* (LLM) wants to solve tasks inside a *software environment*. -> โ€ข Your job: give that environment a clean, programmable surface โ€“ a set of -> *tools* the agent can invoke. -> โ€ข MCP is simply the wire-format we use to move those tool calls back and forth -> (like gRPC or HTTP but JSON-RPC over stdio/Docker). -> โ€ข FastMCP is the underlying SDK; HUD provides **MCPServer** โ€“ a thin wrapper that -> adds SIGTERM handling, `@initialize` / `@shutdown` decorators, and easier -> tool registration while remaining 100 % compatible with FastMCP. -> -> The picture: -> ```text -> LLM Agent โ”€โ”€JSON-RPCโ”€โ”€โ–บ FastMCP server (your code) โ”€โ”€โ–บ real app / game / browser -> ``` -> Your job is to wrap *any* app in an MCP server so agents can control it reproducibly & safely. 
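-
-For a concrete feel of that wire format: when an agent invokes a tool, the client sends a `tools/call` JSON-RPC request like the sketch below (the tool name and arguments are only illustrative):
-
-```json
-{
-  "jsonrpc": "2.0",
-  "id": 2,
-  "method": "tools/call",
-  "params": {
-    "name": "move",
-    "arguments": {"direction": "up"}
-  }
-}
-```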
- ---- - -## Phase Overview - -| Phase | Goal | -|-------|------| -| 1 | A Docker image that *starts* and prints to **stderr** | -| 2 | A minimal MCP server that responds to `initialize` over **stdio** | -| 3 | Working `setup`, `evaluate`, and **interaction** tools | -| 4 | Image launches remotely on the HUD platform & exposes live telemetry | -| 5 | Fast local iteration with `hud dev` hot-reload | - -Take the phases one at a time; do **not** jump ahead. Each stage's checkpoint is the foundation for the next. - -## Reference Implementations - -This repository includes two complete MCP environment implementations that demonstrate different levels of complexity: - -### 1. `text_2048` - Simple Game Environment -A minimalist ASCII-based 2048 game that showcases: -- Basic hub pattern with setup/evaluate tools -- Custom interaction tools (move command) -- Clean separation of game logic and MCP server -- Minimal dependencies (Python only) -- Perfect for learning the core concepts - -### 2. `remote_browser` - Advanced Browser Automation -A sophisticated browser automation environment featuring: -- Multiple cloud browser provider integrations (AnchorBrowser, Steel, BrowserBase, HyperBrowser, Kernel) -- Both Playwright and computer tools for interaction -- Extensive setup/evaluate capabilities (navigation, cookies, sheets, element checks) -- Live telemetry with browser viewing URLs -- Production-ready error handling and cleanup - -๐Ÿ’ก **Follow along with text_2048** as you work through each phase - it demonstrates all the core patterns with minimal complexity. - -### Installing the HUD CLI - -The HUD SDK includes a powerful CLI for debugging and analyzing MCP environments: - -```bash -# Install HUD CLI globally with uv (recommended) -uv tool install hud-python@latest --python 3.12 - -# Or use without installing -uvx --from hud-python hud --help - -# Verify installation -hud --help -``` - -Common commands: -```bash -# Debug your Docker image (runs 5-phase test) -hud debug my-mcp-server:latest - -# Analyze available tools and resources -hud analyze my-mcp-server:latest --format json - -# Debug any command-based MCP server -hud debug --command "python my_server.py" -``` -While you move through the phases it's handy to run the **interactive checker** to make sure nothing broke: - -```bash -# First build your Docker image -docker build -t my-environment environments/my-environment - -# Then debug it -hud debug my-environment -``` - -**What's the difference?** -- **`hud debug`** - Tests your environment in 5 phases, checking startup, MCP protocol, tools, and readiness. Use this first! -- **`hud analyze`** - Explores the environment to discover all tools, resources, and capabilities. Only works after debug passes phase 3. - -The script walks the *same* checklist and prints coloured, human-friendly hints whenever something fails. - -| What it validates | Phase | -|-------------------|-------| -| Container starts & logs to **stderr** | 1 | -| MCP server responds to an `initialize` request | 2 | -| Discovers `setup`, `evaluate`, and interaction tools | 3 | -| Calls `setup` / `evaluate`, checks telemetry & startup time | 4 | -| Spawns three concurrent clients to stress-test resources | 5 | - -๐Ÿ’ก **Run it after finishing each phase.** If the checker exits with a red โŒ, scroll up for the gold-coloured *hint* block โ€“ it usually points directly to the root cause. - ---- - -## Phase 1 โ€“ Write a Dockerfile - -**Goal โ†’** Create a container that can run your MCP server with proper Python packaging. 
- -Key principles: -- **stdout** is reserved for MCP protocol (JSON-RPC) -- **stderr** is for all logs and debug output -- Use proper Python packaging with `pyproject.toml` -- Run as a module for clean imports - -### Dockerfile Template - -```dockerfile -FROM python:3.11-slim - -# Prevent Python from buffering output (important for logs) -ENV PYTHONUNBUFFERED=1 \ - PYTHONDONTWRITEBYTECODE=1 - -WORKDIR /app - -# Copy package files -COPY pyproject.toml ./ -COPY src/ ./src/ - -# Install in editable mode for development flexibility -RUN pip install --no-cache-dir -e . - -# Run as a module to ensure proper package imports -CMD ["python", "-m", "my_module.server"] -``` - -### Build & Test - -```bash -docker build -t my-environment . - -# Test Phase 1: Container should start without errors -docker run --rm -i my-environment -``` - -### Recommended Environment Structure - -For Python-based MCP environments, use this standard structure: - -``` -my-environment/ -โ”œโ”€โ”€ Dockerfile -โ”œโ”€โ”€ README.md -โ”œโ”€โ”€ server/ # MCP server package -โ”‚ โ”œโ”€โ”€ pyproject.toml # MCP dependencies (hud-python, etc.) -โ”‚ โ”œโ”€โ”€ __init__.py # Empty package marker -โ”‚ โ”œโ”€โ”€ main.py # mcp = MCPServer() + lifecycle hooks -โ”‚ โ”œโ”€โ”€ tools.py # router = MCPRouter() + @router.tool decorators -โ”‚ โ”œโ”€โ”€ setup/ # Setup router (modular approach) -โ”‚ โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”‚ โ”œโ”€โ”€ basic.py # Basic setup functions -โ”‚ โ”‚ โ””โ”€โ”€ advanced.py # Advanced setup functions -โ”‚ โ””โ”€โ”€ evaluate/ # Evaluate router (modular approach) -โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”œโ”€โ”€ checks.py # Basic evaluation checks -โ”‚ โ””โ”€โ”€ metrics.py # Advanced metrics evaluators -โ””โ”€โ”€ environment/ # Backend service package - โ”œโ”€โ”€ pyproject.toml # Backend dependencies (fastapi, uvicorn) - โ”œโ”€โ”€ __init__.py - โ””โ”€โ”€ server.py # FastAPI app with /health, /act, /reset, /state -``` - -This structure enables: -- Clean separation of concerns (environment logic, tools, setup, evaluation) -- Easy volume mounting for development (Phase 5) -- Standard Python packaging with `pip install -e .` -- Modular organization - each setup/evaluator in its own file for clarity - -โ€ข **One Dockerfile only** โ€“ no docker-compose. -โ€ข If you're building a GUI environment, start from `hudpython/novnc-base:latest` instead and leave VNC configuration for later phases. - -Checkpoint reached? Congratulations โ€“ move on. - -๐Ÿ‘‰ Quick sanity check: `hud debug my-environment` (verifies Phase 1 automatically) - -Need inspiration? Check out our reference implementations: -โ€ข [`text_2048/Dockerfile`](./text_2048/Dockerfile) - Minimal Python setup, perfect for simple environments -โ€ข [`remote_browser/Dockerfile`](./remote_browser/Dockerfile) - Uses pre-built base image with browser dependencies -โ€ข [`browser/Dockerfile`](./browser/Dockerfile) - Multi-stage build with full GUI support - ---- - -## Phase 2 โ€“ Create the MCP Server - -**Goal โ†’** a Python process that: -1. Speaks MCP over **stdio**. -2. Responds correctly to the `initialize` request. -3. Logs everything to **stderr**. - -The MCP lifecycle is *initialize โ†’ operate โ†’ shutdown* (see spec link above). 
- -### Skeleton server (MCPServer) - -```python -import sys -import logging -from hud.server import MCPServer - -# 1๏ธโƒฃ Always log to stderr โ€“ stdout is reserved for JSON-RPC -logging.basicConfig( - stream=sys.stderr, - level=logging.INFO, - format='[%(levelname)s] %(asctime)s | %(name)s | %(message)s' -) - -# Create the server early so decorators can reference it -mcp = MCPServer(name="My Environment") - -# Run heavy one-time setup during MCP initialize -@mcp.initialize -async def initialize_environment(session=None, progress_token=None): - """Heavy one-time setup โ€“ start databases, launch background apps, etc.""" - logging.info("starting core servicesโ€ฆ") - await start_services() # your coroutine - logging.info("services ready") - -if __name__ == "__main__": - mcp.run() -``` - -*(Replace `start_services()` with whatever takes noticeable startup time โ€“ browsers, DBs, X servers, โ€ฆ)* - -### Adapt Dockerfile - -At the end of your Dockerfile, you must launch the MCP server as the container's main process, ensuring it communicates over stdio (stdin/stdout). This is typically done by setting the `CMD` or `ENTRYPOINT` to run your server module directly, for example: - - -```dockerfile -FROM python:3.11-slim - -WORKDIR /app -COPY . . - -# Optional: install requirements -# RUN pip install -r requirements.txt - -CMD ["python", "-m", "your_module_name"] # Replace 'your_module_name' with your actual entrypoint module -``` - -### Three validation steps (run them **in order**) - -| # | What you do | Why it matters | -|---|-------------|----------------| -| 1 | **Direct stdio test** โ€“ pipe the JSON below into your script | Proves the Python code handles `initialize` without any client or Docker noise | -| 2 | **MCP Inspector** โ€“ `npx @modelcontextprotocol/inspector python -m my_package.server` | Lets you click around: view capabilities, tools, resources | -| 3 | **Inside Docker** โ€“ rebuild the image and run it | This is *exactly* how HUD will execute the server | -| 4 | **Run `hud debug`** โ€“ `hud debug my-environment` | Combines the above checks & points out common mistakes | - -#### JSON for step 1 - -```jsonc -{ "jsonrpc": "2.0", "id": 1, "method": "initialize", "params": { - "protocolVersion": "2024-11-05", - "capabilities": {"roots": {"listChanged": true}}, - "clientInfo": {"name": "DevClient", "title": "Dev", "version": "0.0.0"} -}} -``` - -Pipe it: - -```bash -echo '' | python -m my_package.server -``` - -If all three validations succeed, you have a real MCP server โ€“ time to make it useful. - ---- - -## Phase 3 โ€“ Add Setup / Evaluate / Interaction Tools - -**Goal โ†’** tools are discoverable in the Inspector *and* callable from the HUD SDK. - -๐Ÿ‘‰ After wiring in the tools, confirm with `hud debug my-environment --max-phase 3` โ€“ it now checks for their presence and basic execution. - -๐Ÿ” Once debug passes phase 3, you can analyze the environment: -```bash -hud analyze my-environment # Interactive view of tools and resources -hud analyze my-environment --format json # JSON output for scripts -hud analyze my-environment --format markdown # Generate documentation -``` - -1. Write **`setup`** and **`evaluate`** tools first โ€“ they are *lifecycle* tools and never shown to the LLM. -2. Register at least one **interaction** tool (`computer`, `playwright`, or your own). 
- -### Approach 1: Simple Direct Implementation - -For simple environments with just a few setup/evaluate functions, you can use direct tool decorators with **MCPServer**: - -```python -from hud.server import MCPServer -from hud.tools import HudComputerTool - -mcp = MCPServer(name="my-environment") - -@mcp.tool() -async def setup(config: dict) -> dict: - ... # prepare environment - -@mcp.tool() -async def evaluate(config: dict) -> dict: - ... # return {"reward": <0-1>, "done": bool} - -@mcp.initialize -async def initialize_environment(session=None, progress_token=None): - custom_tool = HudComputerTool() - mcp.add_tool(custom_tool.mcp) - - # Any other initialization -``` - -### Approach 2: Hub Pattern (Recommended for Complex Environments) - -The BaseHub pattern provides a clean way to organize multiple setup/evaluate functions with automatic discovery and registration. **A BaseHub is fundamentally another MCP server (it's a subclass of FastMCP)** that you mount to your main server, providing namespace separation and modular organization. All hub functions are exposed through one tool named after the hub, and a resource that can list all of its tools. - -When mounted, the hub's tools are accessible through a single tool that dispatches to the appropriate function: -```json -{ - "name": "setup", - "arguments": { - "name": "reset", // Which function in the hub to call - "arguments": {"param": "value"} // Additional parameters - } -} -``` - -```python -# In setup/__init__.py -from hud.tools.base import BaseHub - -# Create the setup hub (a sub-server) -setup = BaseHub("setup") - -# Import all setup modules to register their tools -from . import basic, advanced # This registers all @setup.tool() decorated functions - -# In setup/basic.py -from . import setup -from mcp.types import TextContent - -@setup.tool() -async def reset(**kwargs): - """Reset the environment to its initial state. - - Args: - **kwargs: Additional parameters - - Returns: - TextContent - """ - # Access environment from the hub - env = setup.env - await env.reset_state() - return TextContent( - text="Environment reset to initial state", - type="text" - ) - -@setup.tool() -async def seed_data(num_items: int = 5): - """Seed the environment with test data. - - Args: - num_items: Number of items to create - - Returns: - TextContent - """ - # Access environment from the hub - env = setup.env - items = await env.create_items(num_items) - return TextContent( - text=f"Created {len(items)} items", - type="text" - ) - -# In evaluate/__init__.py -from hud.tools.base import BaseHub - -# Create the evaluate hub (another sub-server) -evaluate = BaseHub("evaluate") - -# Import all evaluator modules -from . import checks, metrics - -# In evaluate/checks.py -from . import evaluate -from hud.tools.types import EvaluationResult - -@evaluate.tool() -async def task_complete(expected_count: int): - """Check if the expected number of tasks are completed. 
- - Args: - expected_count: Expected number of completed tasks - - Returns: - EvaluationResult - """ - # Access environment from the hub - env = evaluate.env - completed = await env.count_completed() - return EvaluationResult( - reward=min(completed / expected_count, 1.0), - done=completed >= expected_count, - content=f"Completed {completed}/{expected_count} tasks", - info={"completed": completed, "expected": expected_count} - ) - -# In server.py -from .setup import setup as setup_hub -from .evaluate import evaluate as evaluate_hub - -# Create MCP server -mcp = MCPServer(name="my-environment") - -@mcp.initialize -async def initialize_environment(ctx): - """Initialize the environment with progress notifications.""" - # Extract progress token from context - progress_token = getattr(ctx.meta, "progressToken", None) if ctx.meta else None - # Send progress updates if available - async def send_progress(progress: int, message: str): - if progress_token: - await ctx.session.send_progress_notification( - progress_token=progress_token, - progress=progress, - total=100, - message=message, - ) - - await send_progress(10, "Starting environment initialization...") - - # Initialize your environment state/context - env = await create_environment_context() - await send_progress(50, "Environment created...") - - # Set environment on hubs - setup_hub.env = env - evaluate_hub.env = env - - # Mount hubs to MCP server - mcp.mount(setup_hub) - mcp.mount(evaluate_hub) - await send_progress(80, "Tools registered...") - - # Register any custom interaction tools - if hasattr(env, 'custom_tool'): - mcp.add_tool(env.custom_tool.mcp) - - await send_progress(100, "Environment ready!") -``` - -The BaseHub pattern provides: -- **Namespace isolation**: Tools are grouped under the hub's name (e.g., "setup", "evaluate") -- **Modular organization**: Each hub can be developed and tested independently -- **Type safety**: Full type hints preserved for parameters and returns - -When you call a hub's tool, you specify which function to execute: -```python -# Calling the "reset" function in the setup hub -await client.call_tool("setup", {"name": "reset"}) - -# Calling the "task_complete" function in the evaluate hub -await client.call_tool("evaluate", {"name": "task_complete", "expected_count": 5}) -``` - -### Test workflow - -1. **Inspector first** โ€“ restart the server, refresh the *Tools* tab, confirm the new tools appear. -2. **Run `hud debug my-environment`** โ€“ this validates initialization, tool discovery and basic calls automatically. -3. **Rebuild the image** โ€“ `docker build -t my-environment .`. -4. **HUD SDK script test** โ€“ run a short script like the one below. GUI environments built from `hudpython/novnc-base` still expose a VNC viewer on โ€“ keep it open while testing. 
- -```python -import asyncio -import hud -from hud.datasets import Task -from hud.agents import ClaudeAgent -from hud.clients import MCPClient - -async def main(): - # `trace` captures *everything* that happens and sends it to hud.ai - async with hud.async_trace("local_test"): - task = Task( - prompt="Complete the task", - mcp_config={ - "local": { - "command": "docker", - "args": ["run", "--rm", "-i", "my-environment:latest"] - } - }, - setup_tool={"name": "setup", "arguments": {"name": "todo_seed", "num_items": 5}}, - evaluate_tool={"name": "evaluate", "arguments": {"name": "todo_completed", "expected_count": 2}} - ) - client = MCPClient(mcp_config=task.mcp_config) - - agent = ClaudeAgent( - mcp_client=client, - model="claude-3-7-sonnet-20250219", - allowed_tools=["computer"] # or ["move"] for text_2048 - ) - - result = await agent.run(task) - print(result) - - await client.close() - -asyncio.run(main()) -``` - -The `trace` context manager sends a full timeline of agent actions, tool calls, and rewards to hud.ai โ€“ perfect for debugging. - -See `examples/01_hello_2048.py` and `examples/task_with_setup_eval.py` for larger end-to-end demos. - ---- - -## Phase 4 โ€“ Remote Deployment & HUD Runner - -**Goal โ†’** the exact same image runs in parallel on hundreds of instances, and exposes more telemetry so the hud.ai can visualise the whole lifecycle. - -### 1. Publish your image - -Log in to Docker Hub (or any registry HUD can pull from) and push a tagged build: - -```bash -docker tag my-environment yourdockerhubuser/my-environment:latest -docker push yourdockerhubuser/my-environment:latest -``` - -*(If youโ€™re using a private registry, make sure the HUD worker has pull credentials.)* - -### 2. Launch it remotely (gmail_remote pattern) - -Here's how to configure a remote MCP server that runs **the same Docker image**: - -```python -from hud import settings -from hud.clients import MCPClient - -# Your image is in a registry, now tell HUD to pull & run it on demand -config = { - "hud": { - "url": settings.hud_mcp_url, - "headers": { - "Authorization": f"Bearer {settings.api_key}", - "Mcp-Image": "yourdockerhubuser/my-environment:latest", # which image to launch - }, - } -} - -client = MCPClient(mcp_config=config) -``` - -_Steps 3 and 4 below are **optional but highly recommended** once the image boots successfully._ - -Spin up **many** agents in parallel by just launching multiple tasks โ€“ HUD will queue and start as many containers as resources allow. - -### 3. Progress updates during `initialize` (Optional) - -At remote scale it can take 10-30 s for heavy services to boot. Use the new -`@mcp.initialize` decorator plus the `session` / `progress_token` parameters to -stream status messages: - -```python -@mcp.initialize -async def initialize_environment(session=None, progress_token=None): - async def send(p, msg): - if session and progress_token: - await session.send_progress_notification( - progress_token=progress_token, - progress=p, - total=100, - message=msg - ) - await send(10, "Starting X11...") - await start_x11() - await send(50, "Launching browserโ€ฆ") - await launch_browser() - await send(100, "ready") -``` - -Those messages are displayed live on hud.ai alongside resource graphs โ€“ perfect feedback while you wait. - -### 4. Live telemetry (`telemetry://live`) (Optional) - -Expose a resource named `telemetry://live` exactly like in `environments/browser/src/hud_controller/server.py` to return live url to be displayed on hud.ai. 
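-
-A minimal sketch of what that resource can look like (the field names are illustrative, and `browser_provider` stands for whatever object your environment already manages):
-
-```python
-@mcp.resource("telemetry://live")
-async def live_telemetry():
-    """Expose a live viewing URL so hud.ai can embed it next to the trace."""
-    return {
-        "status": "running" if browser_provider else "stopped",
-        "live_url": browser_provider.get_live_view_url() if browser_provider else None,
-    }
-```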
- -Once all of the above works you can unleash *hundreds* of concurrent agents on your new environment. - ---- - -## Phase 5 โ€“ Hot-Reload Development - -For rapid local development, run the controller and environment servers separately. This enables instant code updates without Docker rebuilds. - -### Development Setup - -You'll need **two terminal windows** for local development: - -#### Terminal 1: MCP Server -```bash -cd environments/my-environment/server -hud dev # Auto-detects and runs with hot-reload - -# Optional flags: -hud dev --inspector # Launch MCP Inspector -hud dev --interactive # Launch interactive testing mode -hud dev --stdio # Use stdio transport (default: HTTP) -hud dev --watch ../shared # Watch additional directories -``` - -The `hud dev` command: -- Auto-detects the MCP module in the current directory -- Watches for file changes and reloads automatically -- Runs on HTTP by default (http://localhost:8765/mcp) -- Can launch MCP Inspector for testing tools -- Can launch interactive mode for manual testing - -#### Terminal 2: Environment Server (Backend) -```bash -cd environments/my-environment/environment -uvicorn server:app --reload # Standard uvicorn with hot-reload -``` - -For the backend, we simply use `uvicorn` directly since it already provides excellent hot-reload capabilities. - -### Development Workflow - -1. Start both servers in separate terminals -2. Edit code in either `server/` or `environment/` - changes reload automatically -3. Test changes immediately without rebuilding Docker images -4. Use MCP Inspector or interactive mode to test tools -5. When ready, build the complete Docker image: `hud build` - -### Quick Cursor Setup - -Add to `.cursor/mcp.json` (or use the deeplink from `hud dev` output): - -```json -{ - "mcpServers": { - "my-environment-dev": { - "url": "http://localhost:8765/mcp" - } - } -} -``` - -**Note**: Make sure both MCP server and environment backend are running when using with Cursor or agents. - -### Process Separation for Stateful Environments - -**Important Architecture Pattern**: For environments that maintain state (browsers, databases, running applications), you should separate the MCP server process from the actual environment process. This separation is critical for effective hot-reload development. - -#### Why Process Separation? - -When `hud dev` restarts your MCP server for code changes, you don't want to lose: -- Open browser windows and navigation state -- Database connections and data -- Running application state -- X11/VNC sessions -- Any expensive initialization - -#### Architecture Pattern - -``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ MCP Server โ”‚โ”€โ”€โ”€โ”€โ–ถโ”‚ Environment Process โ”‚ -โ”‚ (Restartable) โ”‚ โ”‚ (Persistent) โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - โ–ฒ โ”‚ - โ”‚ โ”‚ - โ””โ”€โ”€โ”€ Communication โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ - (Socket, API, gRPC) -``` - -#### Implementation Example - -1. 
**Create a Context Server** (`context_server.py`):
-```python
-import asyncio
-
-from hud.server.context import run_context_server
-
-class PersistentEnvironmentContext:
-    def __init__(self):
-        self.state = {}
-        self.resources = None
-
-    def startup(self):
-        # One-time expensive initialization
-        self.resources = initialize_expensive_resources()
-
-    def get_state(self):
-        return self.state
-
-    def get_resources(self):
-        return self.resources
-
-if __name__ == "__main__":
-    context = PersistentEnvironmentContext()
-    context.startup()
-    # Run on Unix socket
-    asyncio.run(run_context_server(context, "/tmp/my_env_ctx.sock"))
-```
-
-2. **Connect from MCP Server** (`server.py`):
-```python
-from hud.server.context import attach_context
-
-@mcp.initialize
-async def initialize_environment(ctx):
-    # Connect to persistent context
-    persistent_ctx = attach_context("/tmp/my_env_ctx.sock")
-
-    # Use existing state without reinitializing
-    state = persistent_ctx.get_state()
-    resources = persistent_ctx.get_resources()
-```
-
-3. **Update Dockerfile** to run both processes:
-```dockerfile
-# Start context server in background
-CMD ["sh", "-c", "python -m hud_controller.context_server & python -m hud_controller.server"]
-```
-
-#### Communication Options
-
-- **Unix Sockets** (recommended for local): Fast, simple, no network overhead
-- **TCP/HTTP API**: Good for distributed systems
-- **gRPC**: Type-safe, efficient for complex APIs
-- **Shared Memory**: Ultra-fast for large data
-
-See the `browser` environment for a complete production example of this pattern.
-
-### 4. Cursor rules – paste this once
-
-Inside `.cursor/rules/mcp_environment_iteration.mdc` add (or verify) the following so the agent always knows the expected iteration loop:
-
-```mdc
----
-description: Improve an MCP environment
-alwaysApply: false
----
-Setup
-1. Make sure the user has set up the mcp config for the environment by seeing if you have access to the tools by the given name (e.g. my-environment-dev), and make sure the title is in dev mode. If not, ask the user to make a dev version!
-2. Make sure you can find the source folder for this environment. Explore its contents and README.
-3. Clarify the objectives and ask follow up questions on the initial query to determine precise implementation details.
-
-Iteration
-1. Use the tools exposed by the environment to interact with it. This means navigating around with a computer, editing, launching commands, whatever means are accessible to you. If there are any exposed resources, try to access them to determine the structure of the calls.
-2. Based on the objectives, test and verify the functionality of different tools and parts of the environment. If any tool call responds with an error, note it down. If any interaction with the environment is wrong, unexpected, incomplete, or parts of the environment are not developed fully, note it down. If any new problem sets up wrong or evaluation does not match the expected outcome, note it down. All of these inconsistencies you should note down in your TODOs.
-3. Then, based on the TODOs, view the source folder and find the places where those errors would occur. Think about the system and how to fix it. Then fix it.
-4. After you've fixed your TODO items, go back to step 2 and test them. Test through all of your available tools, and use feedback (such as screenshots) to determine your progress. If they now work as expected, mark them as complete. If not, continue the loop from step 2. Be extremely careful, scrupulous and attentive to all details. 
Never assume something is working unless you've tested it fully for all of its edge cases.
-5. The only time you can exit this iteration loop is if there is no feasible way to create input conditions to test something. In this case, ask the user for help and recap your progress. If you're simply changing tools, changing code, and still have more realistic TODOs, the restart_server tool automatically refreshes the environment and you should continue working. In *all* other cases, you must continue this iteration loop until you can come up with no more TODOs. You must not halt.
-```
-
-### 5. Prompt the agent
-
-```txt
-Context: In the my-environment folder, I have a browser app environment. I've built a tool to interact with it called my-environment-dev.
-Interaction: There are multiple tools to set up and evaluate the environment. There are also interaction tools for you to be able to move around it, and a screenshot tool to see the state. Use all of the available tools.
-Objective: Please test whether all setup and evaluation functions are working. This means you should come up with new problem definitions to test all functionality on. Be creative in how you pick edge cases to test on.
-Rules: @mcp_environment_iteration.mdc
-```
-
----
-
-## Phase 6 – Optional Polish & Extensions
-
-### Deeper dive into registries
-
-An environment often needs *structured knowledge* about tasks, evaluation logic, or problem definitions. The browser examples keep these in three explicit registries:
-
-| Registry | Purpose | Example resource URI |
-|----------|---------|----------------------|
-| **Setup** | How to seed the environment before the agent starts | `setup://registry` & `setup://{env}` |
-| **Evaluators** | Functions that decide success & reward | `evaluators://registry` |
-| **Problems** | Bundled benchmarks / tasks with their own setup & evaluate pairs | `problems://registry` |
-
-Each registry is just a dictionary mapping a *name* to a *class*. Use a **decorator** to register classes:
-
-```python
-from .registry import setup, evaluator, problem
-
-@setup("todo_seed")
-class TodoSeed:
-    ...
-
-@evaluator("todo_completed")
-class TodoCompleted:
-    ...
-
-@problem("todo_basic", description="Complete two todo items", difficulty="easy")
-class TodoBasic:
-    def get_setup(self):
-        return {"name": "todo_seed", "arguments": {"num_items": 5}}
-    def get_evaluation(self):
-        return {"name": "todo_completed", "arguments": {"expected_count": 2}}
-```
-
-Decorators keep registration *next to the implementation* and avoid manual bookkeeping. The server simply exposes the combined metadata through an MCP **resource**. Follow `environments/browser/src/hud_controller/problems/registry.py` as a template and expose the JSON with `@mcp.resource("problems://registry")`.
-
-### Other finishing touches
-
-* **Performance** – lazy-load heavy resources, pool DB connections, cache expensive calls.
-* **Security** – sandbox untrusted code, keep secrets in env vars, audit-log every tool call.
-* **Creative ideas** – API simulators, network test-beds, game worlds… if it fits in Docker it can be an MCP environment.
-
----
-
-## Contributing to Existing Environments
-
-When improving existing environments, follow these guidelines:
-
-### 1. 
Understanding the Environment - -Before making changes: -- Read the environment's README and any documentation -- Run `hud debug ` to test the environment -- Run `hud analyze ` (after debug passes phase 3) to explore capabilities -- Explore the folder structure and identify key components -- Test existing setup/evaluate functions to understand behavior - -### 2. Making Improvements - -**Adding New Setup Functions** -```python -# In setup/my_new_setup.py -from . import setup -from hud.tools import BaseSetup, TextContent - -@setup("my_new_setup", description="Clear description of what this does") -class MyNewSetup(BaseSetup): - async def __call__(self, context, param1: str, param2: int = 10) -> TextContent: - # Implementation - return TextContent(...) -``` - -**Adding New Evaluators** -```python -# In evaluate/my_evaluator.py -from . import evaluator -from hud.tools import BaseEvaluator, EvaluationResult - -@evaluator("my_check", description="What this evaluates") -class MyCheckEvaluator(BaseEvaluator): - async def __call__(self, context, threshold: float) -> EvaluationResult: - score = await context.calculate_score() - return { - "reward": min(score / 100, 1.0), - "done": score >= threshold, - "info": {"score": score, "threshold": threshold} - } -``` - -### 3. Testing Your Changes - -**Use `hud dev` for Hot-Reload Development** -```bash -# Navigate to the environment directory -cd environments/my-environment - -# Start development server with hot-reload -hud dev --build - -# In another terminal, test your changes -hud analyze hud-my-environment:dev - -# Or use interactive mode to test tools directly -hud dev --build --interactive -``` - -The `hud dev` command automatically: -- Mounts your `src/` directory for live code updates -- Handles container lifecycle and restarts -- Provides an HTTP endpoint for testing -- Shows logs for debugging - -## Testing Your Environment - -Once your environment is working, create comprehensive tests to ensure it stays that way: - -### Creating Test Files - -Each environment should have a test file following this pattern: -- `environments//test__mcp.py` - -The test file should include: -1. **Docker Build Test**: Ensure the image builds successfully -2. **MCP Initialization Tests**: Verify phases 1-3 using `hud debug` -3. **Tool-Specific Tests**: Test your environment's unique tools -4. **Integration Tests**: Test complete workflows - -Example test structure: -```python -class TestMyEnvironment: - IMAGE_NAME = "my-environment-test:latest" - - @classmethod - def setup_class(cls): - """Build Docker image before tests""" - # Build the image - - def test_phase1_basic_startup(self): - """Test container starts""" - - @pytest.mark.asyncio - async def test_phase2_3_mcp_initialize_and_tools(self): - """Test MCP init and tool discovery""" - - @pytest.mark.asyncio - async def test_environment_specific_tools(self): - """Test your custom tools""" -``` - -### Running Tests - -You can run tests directly with pytest: - -```bash -# Run all tests for an environment -cd environments/text_2048 -pytest test_text_2048_mcp.py -v -``` - -### Test Dependencies - -Add pytest to your environment's `pyproject.toml`: - -```toml -[project.optional-dependencies] -test = ["pytest>=7.0", "pytest-asyncio>=0.20"] -``` - -## Summary - -1. Start with a *plain* Dockerfile โ€“ verify it runs. -2. Add a minimal FastMCP server โ€“ verify with stdio, Inspector, Docker. -3. Implement tools โ€“ verify discovery + execution. -4. Run the same image remotely โ€“ verify telemetry. -5. 
Automate the loop with cursor-mcp. -6. **Write comprehensive tests** โ€“ ensure reliability. -7. Polish and extend as inspiration strikes. - -Happy building โ€“ and remember: **stderr is your friend, stdout belongs to MCP.** ๐Ÿš€ diff --git a/environments/blank/.env.example b/environments/blank/.env.example deleted file mode 100644 index 86f9a702..00000000 --- a/environments/blank/.env.example +++ /dev/null @@ -1,7 +0,0 @@ -# HUD API Configuration -# Get your API key from https://hud.ai/account -HUD_API_KEY="" - -# Anthropic API Configuration (optional) -# Required for using Claude agents - get from https://console.anthropic.com/ -ANTHROPIC_API_KEY="" diff --git a/environments/blank/Dockerfile b/environments/blank/Dockerfile deleted file mode 100644 index fd2639bd..00000000 --- a/environments/blank/Dockerfile +++ /dev/null @@ -1,22 +0,0 @@ -FROM public.ecr.aws/docker/library/python:3.11-bookworm - -WORKDIR /app - -RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* - -# Copy and install MCP server dependencies -COPY server/pyproject.toml ./server/ -RUN pip install --no-cache-dir ./server - -# Copy and install environment dependencies -COPY environment/pyproject.toml ./environment/ -RUN pip install --no-cache-dir ./environment - -# Copy source code after dependencies -COPY server/ ./server/ -COPY environment/ ./environment/ - -ENV ENV_SERVER_PORT=8005 - -# Start environment server in background, then run MCP server with hot-reload -CMD ["sh", "-c", "uvicorn environment.server:app --host 0.0.0.0 --port $ENV_SERVER_PORT --log-level warning --reload >&2 & sleep 0.5 && hud dev server.main --stdio"] diff --git a/environments/blank/README.md b/environments/blank/README.md deleted file mode 100644 index e62c47e4..00000000 --- a/environments/blank/README.md +++ /dev/null @@ -1,128 +0,0 @@ -# Blank Environment - -Minimal starter template for building HUD environments. -See [docs](https://docs.hud.ai/build-environments) for the complete environment design workflow. - -## Architecture - -**`environment/`** - Produces structured data - -- Owns all state (game logic, browser sessions, databases, etc.) -- Exposes HTTP endpoints `/health`, `/act`, `/reset`, `/state` that return structured information about the environment state - -**`server/`** - Wraps data in MCP tools - -- Calls environment endpoints to get structured data for the agent, and environment setup/evaluation -- Agents and tasks interact only with these tools! - -**Why separate?** Edit tools for the agent or tasks without restarting the heavy environment backend. - -## Development - -```bash -# Terminal 1 - Environment backend -cd environment -uv run uvicorn server:app --reload - -# Terminal 2 - MCP server -cd server -uv run hud dev -``` - -Uncomment the `setup` tool in `server/tools.py`, save, and watch it reload. -Visit http://localhost:8765/docs to see the new tool appear instantly. - -In general, we recommend starting work on the environment backend first, then developing the MCP server to expose the right things to the agent. - -For complex environments that require many dependencies, we recommend running `hud dev` in the environment root: - -```bash -cd .. 
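# assumption: running hud dev from the environment root lets it watch both
# server/ and environment/, not just the MCP layer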
-hud dev -``` - -## Tasks & Evaluation - -```bash -# Build first in the global folder with the Dockerfile (creates blank:0.1.0) -hud build -``` - -Your `tasks.json` uses `docker run` to launch the environment: - -```json -{ - "prompt": "Your task prompt", - "mcp_config": { - "local": { - "command": "docker", - "args": ["run", "--rm", "-i", "blank:0.1.0"] - } - } -} -``` - -**Commands:** - -```bash -# Build first -hud build - -# Test task locally -hud eval tasks.json - -# Push environment for remote running -hud push - -# Production RL training -hud rl tasks.json # Auto-converts dockerโ†’remote, builds & pushes if needed -``` - -## Publishing Your Environment - -Once your environment is ready, you can share it with the community: - -### 1. Push to Registry - -```bash -# Build and push your environment (requires docker hub login and hud api key) -hud build -hud push -``` - -### 2. Create a Dataset - -Create a dataset on HuggingFace with your tasks: - -**Option A: Upload manually** - -1. Upload your `tasks.json` to HuggingFace -2. Make sure it's **public** to appear on leaderboards - -**Option B: Use the SDK** - -```python -from hud.datasets import save_tasks -import json - -# Load your tasks -with open("tasks.json") as f: - tasks = json.load(f) - -# Push to HuggingFace -save_tasks(tasks, repo_id="your-org/your-dataset") -``` - -### 3. Run and Track Performance - -```bash -# Run Claude on your benchmark -hud eval "your-org/your-dataset" claude - -# View results at: -# hud.ai/leaderboards/your-org/your-dataset -``` - -**Note**: Only public HuggingFace datasets appear as leaderboards! - -๐Ÿ“š Learn more: [Creating Benchmarks](https://docs.hud.ai/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.ai/evaluate-agents/leaderboards) diff --git a/environments/blank/environment/README.md b/environments/blank/environment/README.md deleted file mode 100644 index b902ec25..00000000 --- a/environments/blank/environment/README.md +++ /dev/null @@ -1,16 +0,0 @@ -# Environment - -Backend service: owns state and exposes HTTP APIs the controller calls. - -Endpoints (FastAPI) -- `GET /health` โ†’ {status: ok} -- `POST /act` โ†’ increments counter and returns {count} -- `POST /reset` โ†’ resets counter -- `GET /state` โ†’ returns {count} - -Run (dev) -```bash -uv run uvicorn server:app --reload --port 8005 -``` - -Principle: treat like a backend. Keep longโ€‘lived state here; add endpoints as tools need them. 
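
Before wiring anything into MCP, it can help to smoke-test these endpoints directly over HTTP. The snippet below is a minimal sketch (the `BASE_URL` constant and `main()` script are illustrative, not part of the environment), assuming the backend is running locally on port 8005 as in the dev command above:

```python
# Minimal smoke test for the blank environment backend (sketch).
# Assumes the FastAPI app above is running locally, e.g.
#   uv run uvicorn server:app --reload --port 8005
import httpx

BASE_URL = "http://localhost:8005"  # illustrative; matches the ENV_SERVER_PORT default

def main() -> None:
    with httpx.Client(base_url=BASE_URL, timeout=10.0) as client:
        # Health check before anything else
        assert client.get("/health").json() == {"status": "ok"}

        # Reset the counter, take three actions, then read the state back
        client.post("/reset")
        for _ in range(3):
            client.post("/act")
        count = client.get("/state").json()["count"]
        print(f"count = {count}")  # expected: count = 3

if __name__ == "__main__":
    main()
```

If the counter behaves as expected here, the MCP tools in `server/tools.py` only need to forward requests to these same endpoints.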
diff --git a/environments/blank/environment/__init__.py b/environments/blank/environment/__init__.py deleted file mode 100644 index d9cd6199..00000000 --- a/environments/blank/environment/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Blank environment package.""" diff --git a/environments/blank/environment/pyproject.toml b/environments/blank/environment/pyproject.toml deleted file mode 100644 index 8256f97e..00000000 --- a/environments/blank/environment/pyproject.toml +++ /dev/null @@ -1,16 +0,0 @@ -[project] -name = "blank-environment" -version = "0.1.0" -description = "Backend service for blank environment" -requires-python = ">=3.11" -dependencies = [ - "fastapi", - "uvicorn[standard]", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["."] diff --git a/environments/blank/environment/server.py b/environments/blank/environment/server.py deleted file mode 100644 index 7a382599..00000000 --- a/environments/blank/environment/server.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Minimal FastAPI environment server (HTTP-based).""" - -from fastapi import FastAPI - -import logging -import sys - -logging.basicConfig( - stream=sys.stderr, - level=logging.INFO, - format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", -) - -app = FastAPI(title="Blank Environment API") - -_count = 0 - - -@app.get("/health") -def health(): - return {"status": "ok"} - - -@app.post("/act") -def act(): - global _count - _count += 1 - return {"count": _count} - - -@app.post("/reset") -def reset(): - global _count - _count = 0 - return {"ok": True} - - -@app.get("/state") -def state(): - return {"count": _count} diff --git a/environments/blank/server/README.md b/environments/blank/server/README.md deleted file mode 100644 index 19fc7068..00000000 --- a/environments/blank/server/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# MCP Server - -MCP layer that wraps environment data in tools for agent interaction. 
- -## Structure - -- `main.py` - Server initialization, imports routers -- `tools.py` - MCP tools that call environment HTTP endpoints - -## Development - -```bash -# Start MCP server with hot-reload -uv run hud dev -``` - -## Key Principles - -- Keep tools thin - call environment HTTP endpoints -- Use routers for organization -- All long-lived state lives in `environment/`, not here \ No newline at end of file diff --git a/environments/blank/server/__init__.py b/environments/blank/server/__init__.py deleted file mode 100644 index 219d9cdd..00000000 --- a/environments/blank/server/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""MCP server package.""" diff --git a/environments/blank/server/main.py b/environments/blank/server/main.py deleted file mode 100644 index bbe98d13..00000000 --- a/environments/blank/server/main.py +++ /dev/null @@ -1,43 +0,0 @@ -import sys -import logging -from hud.server import MCPServer -from server.shared import http_client - -# Configure logging to stderr -logging.basicConfig( - stream=sys.stderr, - level=logging.INFO, - format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s", - force=True, -) -for logger_name in ["httpx", "httpcore"]: - logging.getLogger(logger_name).setLevel(logging.WARNING) - -# Create main MCP server -mcp = MCPServer(name="blank-environment") - -# Include routers -from server.tools import router as tools_router - -mcp.include_router(tools_router) - - -# Lifecycle hooks -@mcp.initialize -async def init(): - """Check if the environment is healthy""" - if http_client: - await http_client.get("/health") - else: - raise ValueError("http_client is not set") - - -@mcp.shutdown -async def cleanup(): - """Close the HTTP client""" - if http_client: - await http_client.aclose() - - -if __name__ == "__main__": - mcp.run(transport="stdio") diff --git a/environments/blank/server/pyproject.toml b/environments/blank/server/pyproject.toml deleted file mode 100644 index 403f92c0..00000000 --- a/environments/blank/server/pyproject.toml +++ /dev/null @@ -1,19 +0,0 @@ -[project] -name = "blank-server" -version = "0.1.0" -description = "MCP server for blank environment" -requires-python = ">=3.11" -dependencies = [ - "hud-python>=0.4.54", - "httpx>=0.28.1", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.metadata] -allow-direct-references = true - -[tool.hatch.build.targets.wheel] -packages = ["."] diff --git a/environments/blank/server/shared.py b/environments/blank/server/shared.py deleted file mode 100644 index ad81fac5..00000000 --- a/environments/blank/server/shared.py +++ /dev/null @@ -1,15 +0,0 @@ -from __future__ import annotations - -import os -import httpx - -# Environment port (as string to simplify formatting) -ENV_SERVER_PORT = os.getenv("ENV_SERVER_PORT", "8005") - -# Shared HTTP client for talking to the environment backend -http_client = httpx.AsyncClient( - base_url=f"http://localhost:{ENV_SERVER_PORT}", - timeout=10.0, -) - -__all__ = ["ENV_SERVER_PORT", "http_client"] diff --git a/environments/blank/server/tools.py b/environments/blank/server/tools.py deleted file mode 100644 index 32f3c414..00000000 --- a/environments/blank/server/tools.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Tools router for environment interaction.""" - -from hud.server import MCPRouter -from hud.tools.types import EvaluationResult -from server.shared import http_client - -router = MCPRouter() - - -@router.tool -async def act() -> str: - """Perform one action step in the environment (increment the counter).""" - resp = await 
http_client.post("/act") - data = resp.json() - return f"Action #{data.get('count', 0)} performed. Current count: {data.get('count', 0)}" - - -@router.tool -async def setup() -> str: - """Initialize or reset the environment to its starting state.""" - await http_client.post("/reset") - return "Counter reset to 0" - - -@router.tool -async def evaluate(target: int = 10) -> EvaluationResult: - """Evaluate progress toward the target count and return a reward and done flag.""" - resp = await http_client.get("/state") - current_count = resp.json().get("count", 0) - delta = target - current_count - reward = max(0.0, 1.0 - abs(delta) / target) if target > 0 else current_count - done = current_count >= target - return EvaluationResult( - reward=reward, done=done, content=f"Counter at {current_count}/{target}" - ) diff --git a/environments/blank/tasks.json b/environments/blank/tasks.json deleted file mode 100644 index f24e7b63..00000000 --- a/environments/blank/tasks.json +++ /dev/null @@ -1,44 +0,0 @@ -[ - { - "prompt": "Increment the counter to reach 3", - "mcp_config": { - "local": { - "command": "docker", - "args": [ - "run", - "--rm", - "-i", - "blank:latest" - ] - } - }, - "agent_config": { - "allowed_tools": ["act"], - "append_setup_output": true - }, - "setup_tool": { - "name": "setup", - "arguments": {} - }, - "integration_test_tool": [ - { - "name": "act", - "arguments": {} - }, - { - "name": "act", - "arguments": {} - }, - { - "name": "act", - "arguments": {} - } - ], - "evaluate_tool": { - "name": "evaluate", - "arguments": { - "target": 3 - } - } - } -] diff --git a/environments/blank/test_task.py b/environments/blank/test_task.py deleted file mode 100644 index 0f46690a..00000000 --- a/environments/blank/test_task.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python -""" -Simple example of running tasks from tasks.json. Make sure to have run hud build. -""" - -from __future__ import annotations - -import asyncio -import json - -from hud.clients import MCPClient -from hud.datasets import Task - - -async def run_task(task_data: dict): - task = Task(**task_data) - client = MCPClient(mcp_config=task.mcp_config) - - try: - print("Initializing client...") - await client.initialize() - - result = await client.call_tool(task.setup_tool) # type: ignore - print(f"โœ… Setup: {result.content}") - - print("\n๐Ÿ”„ Performing actions:") - for _ in range(10): - result = await client.call_tool(name="act", arguments={}) - print(f" {result.content}") - - result = await client.call_tool(task.evaluate_tool) # type: ignore - print(f"\n๐Ÿ“Š Evaluation: {result.content}") - - return result.content - except Exception as e: - if "connection" in str(e).lower(): - print( - "โŒ Could not connect. Make sure 'hud dev --build' is running in another terminal." 
- ) - else: - raise e - finally: - await client.shutdown() - - -async def main(): - for task_data in json.load(open("tasks.json")): - await run_task(task_data) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/environments/browser/.dockerignore b/environments/browser/.dockerignore deleted file mode 100644 index f91da037..00000000 --- a/environments/browser/.dockerignore +++ /dev/null @@ -1,101 +0,0 @@ -# Git -.git -.gitignore - -# Node -environment/*/frontend/node_modules -environment/*/frontend/.next -environment/*/frontend/build -environment/*/frontend/dist -environment/*/frontend/.turbo -environment/*/frontend/.vercel -environment/*/frontend/next-env.d.ts -environment/*/frontend/package-lock.json -# General Node/Next artifacts anywhere -node_modules -**/node_modules -**/.next -**/.turbo -**/.vercel -*.log - -# Python -__pycache__ -**/__pycache__ -*.pyc -*.pyo -*.pyd -.Python -*.egg-info -.pytest_cache -.mypy_cache -.coverage -.venv -venv -env -environment/*/backend/.venv -environment/*/backend/venv -environment/*/backend/__pycache__ - -# Database - exclude ALL database files -*.db -*.sqlite -*.db-journal -*.db-wal -*.db-shm -**/*.db -**/*.sqlite -**/*.db-journal -**/*.db-wal -**/*.db-shm - -# IDE -.vscode -.idea -*.swp -*.swo - -# OS -.DS_Store -Thumbs.db - -# Documentation -*.md -!app/README.md -!launch/README.md - -# Unix sockets, locks, pids (can break Docker context on Windows) -**/*.sock -**/*.socket -**/*.pipe -**/*.pid -**/*.lock -**/*.ipc - -# Symlinks and special files -**/*.lnk -**/symlink* -**/.venv -**/.env -**/venv -**/env - -# Temporary and cache files -*.tmp -*.temp -*.cache -**/*.tmp -**/*.temp -**/*.cache -**/tmp/ -**/temp/ -**/cache/ - -# Lock files that might have special permissions -yarn.lock -poetry.lock -Pipfile.lock -**/yarn.lock -**/*.lock -environment/uv.lock -controller/uv.lock \ No newline at end of file diff --git a/environments/browser/.gitignore b/environments/browser/.gitignore deleted file mode 100644 index 5397595a..00000000 --- a/environments/browser/.gitignore +++ /dev/null @@ -1,100 +0,0 @@ -# Dependencies -node_modules/ -.pnp -.pnp.js - -# Testing -coverage/ -.coverage -.pytest_cache/ -htmlcov/ - -# Next.js -.next/ -out/ -build/ -*.tsbuildinfo -next-env.d.ts - -# Production -dist/ - -# Misc -.DS_Store -*.pem -Thumbs.db - -# Debug -npm-debug.log* -yarn-debug.log* -yarn-error.log* -.pnpm-debug.log* - -# Local env files -.env -.env.local -.env.development.local -.env.test.local -.env.production.local - -# Vercel -.vercel - -# TypeScript -*.tsbuildinfo - -# Python -__pycache__/ -*.py[cod] -*$py.class -*.so -.Python -env/ -venv/ -.venv/ -ENV/ -env.bak/ -venv.bak/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# uv -.venv/ -uv.lock - -# Database -*.db -*.sqlite -*.sqlite3 -app.db - -# IDEs -.vscode/ -.idea/ -*.swp -*.swo -*~ -.project -.classpath -.c9/ -*.launch -.settings/ -*.sublime-workspace - -# OS -.DS_Store -.DS_Store? -._* -.Spotlight-V100 -.Trashes -ehthumbs.db -Thumbs.db - -# Logs -logs/ -*.log - -# Docker -.dockerignore.local \ No newline at end of file diff --git a/environments/browser/Dockerfile b/environments/browser/Dockerfile deleted file mode 100644 index e25a71f9..00000000 --- a/environments/browser/Dockerfile +++ /dev/null @@ -1,60 +0,0 @@ -# syntax=docker/dockerfile:1 -FROM hudevals/hud-browser-base:latest AS setup - -WORKDIR /app - -# Layer 1: Install server dependencies -COPY server/pyproject.toml /app/server/ -RUN cd /app/server && uv pip install --system --break-system-packages . 
- -# Layer 2: Install environment dependencies -COPY environment/pyproject.toml /app/environment/ -RUN cd /app/environment && uv pip install --system --break-system-packages . - -# Layer 3: Copy source code (changes here don't invalidate dependency layers) -COPY server/ /app/server/ -COPY environment/ /app/environment/ - -# Auto-discover and install/build all frontend apps -RUN set -e; \ - for pkg in $(find /app/environment -type f -path '*/frontend/package.json'); do \ - app_dir=$(dirname "$pkg"); \ - echo "Installing dependencies in $app_dir"; \ - if [ -f "$app_dir/package-lock.json" ]; then \ - (cd "$app_dir" && npm ci --no-audit --no-fund); \ - else \ - (cd "$app_dir" && npm install --no-audit --no-fund); \ - fi; \ - done && \ - for pkg in $(find /app/environment -type f -path '*/frontend/package.json'); do \ - app_dir=$(dirname "$pkg"); \ - if [ -f "$app_dir/next.config.js" ]; then \ - echo "Building Next.js app in $app_dir"; \ - (cd "$app_dir" && npm run build); \ - fi; \ - done - -# Make scripts executable -RUN find /app/environment -name "*.py" -type f -exec chmod +x {} \; - -# Environment configuration -ENV MCP_TRANSPORT="stdio" -ENV HUD_LOG_STREAM="stderr" -ENV PYTHONUNBUFFERED="1" -ENV PYTHONWARNINGS="ignore::SyntaxWarning:pyautogui" -ENV DISPLAY=":1" -ENV PYTHONPATH=/app - -# Expose ports -EXPOSE 8000 8080 3000-3200 5000-5200 - -# Simple startup: HUD_DEV=1 enables hot-reload; otherwise run production -CMD ["sh", "-c", "\ - if [ \"${HUD_DEV:-0}\" = \"1\" ]; then \ - uvicorn environment.server:app --host 0.0.0.0 --port 8000 --reload --log-level warning >&2 & \ - sleep 5 && cd /app/server && exec hud dev server.main --stdio; \ - else \ - uvicorn environment.server:app --host 0.0.0.0 --port 8000 --log-level warning >&2 & \ - sleep 5 && cd /app/server && exec python3 -m server.main; \ - fi\ -"] \ No newline at end of file diff --git a/environments/browser/Dockerfile.local b/environments/browser/Dockerfile.local deleted file mode 100644 index c5262633..00000000 --- a/environments/browser/Dockerfile.local +++ /dev/null @@ -1,72 +0,0 @@ -# syntax=docker/dockerfile:1 -# Local development Dockerfile that uses local hud-python -FROM hudevals/hud-browser-base:latest AS setup - -WORKDIR /app - -# Layer 0: Install local hud-python -# Copy local hud-python source (build context is repo root) -COPY hud /app/hud-python/hud/ -COPY pyproject.toml /app/hud-python/ -COPY README.md /app/hud-python/ -COPY LICENSE /app/hud-python/ - -# Install local hud-python -RUN cd /app/hud-python && uv pip install --system --break-system-packages -e . - -# Layer 1: Install server dependencies -COPY environments/browser/server/pyproject.toml /app/server/ -RUN cd /app/server && uv pip install --system --break-system-packages . - -# Layer 2: Install environment dependencies -COPY environments/browser/environment/pyproject.toml /app/environment/ -RUN cd /app/environment && uv pip install --system --break-system-packages . 
- -# Layer 3: Copy source code (changes here don't invalidate dependency layers) -COPY environments/browser/server/ /app/server/ -COPY environments/browser/environment/ /app/environment/ - -# Auto-discover and install/build all frontend apps -RUN set -e; \ - for pkg in $(find /app/environment -type f -path '*/frontend/package.json'); do \ - app_dir=$(dirname "$pkg"); \ - echo "Installing dependencies in $app_dir"; \ - if [ -f "$app_dir/package-lock.json" ]; then \ - (cd "$app_dir" && npm ci --no-audit --no-fund); \ - else \ - (cd "$app_dir" && npm install --no-audit --no-fund); \ - fi; \ - done && \ - for pkg in $(find /app/environment -type f -path '*/frontend/package.json'); do \ - app_dir=$(dirname "$pkg"); \ - if [ -f "$app_dir/next.config.js" ]; then \ - echo "Building Next.js app in $app_dir"; \ - (cd "$app_dir" && npm run build); \ - fi; \ - done - -# Make scripts executable -RUN find /app/environment -name "*.py" -type f -exec chmod +x {} \; - -# Environment configuration -ENV MCP_TRANSPORT="stdio" -ENV HUD_LOG_STREAM="stderr" -ENV PYTHONUNBUFFERED="1" -ENV PYTHONWARNINGS="ignore::SyntaxWarning:pyautogui" -ENV DISPLAY=":1" -ENV PYTHONPATH=/app - -# Expose ports -EXPOSE 8000 8080 3000-3200 5000-5200 - -# Simple startup: HUD_DEV=1 enables hot-reload; otherwise run production -CMD ["sh", "-c", "\ - if [ \"${HUD_DEV:-0}\" = \"1\" ]; then \ - uvicorn environment.server:app --host 0.0.0.0 --port 8000 --reload --log-level warning >&2 & \ - sleep 5 && cd /app/server && exec hud dev server.main --stdio; \ - else \ - uvicorn environment.server:app --host 0.0.0.0 --port 8000 --log-level warning >&2 & \ - sleep 5 && cd /app/server && exec python3 -m server.main; \ - fi\ -"] - diff --git a/environments/browser/README.md b/environments/browser/README.md deleted file mode 100644 index 005e1333..00000000 --- a/environments/browser/README.md +++ /dev/null @@ -1,191 +0,0 @@ -# Browser Environment - -Browser automation environment with GUI access for testing web applications. Includes sample apps (2048, Todo) and supports hot-reload development. - -## Architecture - -**`environment/`** - Produces structured data -- FastAPI backend with X11/VNC services (Linux-only) -- Launches and manages web apps (Next.js frontends + Python backends) -- Exposes HTTP endpoints for app control and state - -**`server/`** - Wraps data in MCP tools -- Browser automation tools (Playwright, computer vision) -- Setup tools (launch apps, seed data) -- Evaluation tools (check game state, todo completion) - -**Why separate?** The environment backend requires X11/VNC/Chromium (Docker-only). The MCP server tools can be edited with hot-reload, while the heavy environment stays running. - -## Development - -This environment **requires Docker** due to X11/VNC dependencies. - -```bash -# Build first (creates hud-browser:0.1.0) -hud build - -# Start with hot-reload -hud dev -``` - -When you run `hud dev` in an environment with a Dockerfile, it automatically: -- Detects Docker mode is needed -- Mounts `server/` and `environment/` as volumes -- Enables hot-reload for both layers - -Edit files in `server/` or `environment/` and they reload inside the container! - -## Publishing Your Environment - -Once your environment is ready, you can share it with the community: - -### 1. Push to Registry -```bash -# Build and push your environment (requires docker hub login and hud api key) -hud build -hud push -``` - -### 2. Create a Dataset - -Create a dataset on HuggingFace with your tasks: - -**Option A: Upload manually** -1. 
Upload your `tasks.json` to HuggingFace -2. Make sure it's **public** to appear on leaderboards - -**Option B: Use the SDK** -```python -from hud.datasets import save_tasks -import json - -# Load your tasks -with open("tasks.json") as f: - tasks = json.load(f) - -# Push to HuggingFace -save_tasks(tasks, repo_id="your-org/your-dataset") -``` - -### 3. Run and Track Performance - -```bash -# Run Claude on your benchmark -hud eval "your-org/your-dataset" --agent claude - -# View results at: -# hud.ai/leaderboards/your-org/your-dataset -``` - -**Note**: Only public HuggingFace datasets appear as leaderboards! - -๐Ÿ“š Learn more: [Creating Benchmarks](https://docs.hud.ai/evaluate-agents/create-benchmarks) | [Leaderboards](https://docs.hud.ai/evaluate-agents/leaderboards) - -## Architecture Overview - -The browser environment uses a two-process architecture: - -1. **Context Server** (`context.py`): Long-running process that maintains persistent state -2. **MCP Server** (`server.py`): Hot-reloadable process that handles tool requests - -### Key Components - -- **BrowserContext**: Stores persistent state (running apps, ports, playwright instance) -- **ServiceManager**: Manages X11, VNC, and app processes -- **BaseHub Tools**: Setup and evaluate tools organized by app (2048, todo) -- **Multiprocessing Proxy**: Enables state sharing between processes - -### 1. Tool Implementation Pattern - -All setup and evaluate tools should follow this pattern: - -```python -@setup.tool("tool_name") -async def tool_name(param1: type, param2: type): - """Tool description.""" - try: - # Get persistent context - persistent_ctx = setup.env # or evaluate.env - - # Get app ports - backend_port = persistent_ctx.get_app_backend_port("app_name") - - # Make HTTP request - url = f"http://localhost:{backend_port}/api/endpoint" - async with httpx.AsyncClient() as client: - response = await client.method(url, json=data) - response.raise_for_status() - result = response.json() - - # Return result - return TextContent( - text=f"Success message", - type="text" - ) - except Exception as e: - logger.error(f"tool_name failed: {e}") - return TextContent( - text=f"Failed: {str(e)}", - type="text" - ) -``` - -### 2. App Launch Pattern - -When launching apps, ensure ports are stored in the persistent context: - -```python -# In launch_app tool -app_info = await service_manager.launch_app(app_name) - -# Store ports in persistent context for later access -try: - backend_port = service_manager.get_app_port(app_name) - frontend_port = service_manager.get_app_frontend_port(app_name) - persistent_ctx.set_app_ports(app_name, frontend_port, backend_port) -except Exception as e: - logger.error(f"Failed to store ports: {e}") - -# Track app in persistent context -persistent_ctx.add_running_app(app_name) -``` - -### 3. Import Organization - -Keep imports at module level: - -```python -# At top of file -import logging -import httpx -from mcp.types import TextContent -from . import setup - -# Not inside functions -``` - -## Development Workflow - -1. **Start the environment**: `hud dev` -2. **Make changes**: Edit tools in `src/hud_controller/` -3. **Test immediately**: The MCP server hot-reloads automatically -4. **Check logs**: Look for serialization or proxy errors - -## Adding New Apps - -1. Create app directory in `apps/` -2. Add setup tools in `src/hud_controller/setup/app_name.py` -3. Add evaluate tools in `src/hud_controller/evaluate/app_name.py` -4. Follow the HTTP pattern - no `call_app_api` usage -5. 
Store app ports in persistent context when launching - -## Key Files - -- `context.py`: Persistent state management -- `server.py`: MCP server and tool definitions -- `services.py`: Process management for X11, VNC, apps -- `setup/`: Setup tools organized by app -- `evaluate/`: Evaluation tools organized by app - -Remember: When in doubt, make direct HTTP calls and store state in the persistent context! - diff --git a/environments/browser/browser-base/Dockerfile b/environments/browser/browser-base/Dockerfile deleted file mode 100644 index 57eb9132..00000000 --- a/environments/browser/browser-base/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -# syntax=docker/dockerfile:1 -FROM ubuntu:24.04 AS setup - -# Update and install core dependencies (including working Chromium browser) -RUN apt-get update -y \ - && apt-get install -y --no-install-recommends \ - vim \ - openssl \ - ca-certificates \ - curl \ - wget \ - sudo \ - bash \ - net-tools \ - novnc \ - x11vnc \ - xvfb \ - xfce4 \ - locales \ - libpq5 \ - sqlite3 \ - dbus-x11 \ - xfce4-terminal \ - xfonts-base \ - xdotool \ - psmisc \ - scrot \ - pm-utils \ - build-essential \ - unzip \ - xauth \ - gnupg \ - gpg \ - jq \ - git \ - build-essential \ - nodejs \ - npm - -RUN update-ca-certificates - -RUN curl -LsSf https://astral.sh/uv/install.sh | sh -ENV PATH="/root/.local/bin:$PATH" - -# Install git for dependency installation -RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* - -# Install Playwright -RUN uv pip install --system --break-system-packages playwright -RUN python3 -m playwright install chromium --with-deps \ No newline at end of file diff --git a/environments/browser/browser-base/README.md b/environments/browser/browser-base/README.md deleted file mode 100644 index 21999fec..00000000 --- a/environments/browser/browser-base/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# Browser Base Image - -Base Docker image for browser environments with Playwright, Chromium, and VNC support. - -## Build - -```bash -docker build -t browser-base:latest . -``` - -## Test with VNC Access - -### 1. Start the container - -```bash -docker run -it --rm \ - -p 6080:6080 \ - -p 5900:5900 \ - -e DISPLAY=:1 \ - browser-base:latest \ - bash -``` - -### 2. Inside the container, start display servers - -```bash -Xvfb :1 -screen 0 1920x1080x24 > /dev/null 2>&1 & -x11vnc -display :1 -nopw -listen 0.0.0.0 -forever > /dev/null 2>&1 & -/usr/share/novnc/utils/novnc_proxy --vnc localhost:5900 --listen 6080 > /dev/null 2>&1 & -``` - -### 3. Test Playwright - -```bash -python3 -c " -from playwright.sync_api import sync_playwright -with sync_playwright() as p: - browser = p.chromium.launch(headless=False) - page = browser.new_page() - page.goto('https://example.com') - print('Title:', page.title()) - input('Press Enter to close...') - browser.close() -" -``` - -### 4. View in browser - -Open `http://localhost:6080/vnc.html` to see Chromium running. - -## What's Included - -- Ubuntu 24.04 -- Desktop environment (Xvfb, x11vnc, noVNC, xfce4) -- Node.js & npm -- Python 3 with uv package manager -- Playwright with Chromium -- Development tools (git, curl, wget, etc.) 
\ No newline at end of file diff --git a/environments/browser/environment/2048/README.md b/environments/browser/environment/2048/README.md deleted file mode 100644 index 474b0c6d..00000000 --- a/environments/browser/environment/2048/README.md +++ /dev/null @@ -1,103 +0,0 @@ -# 2048 Game for Browser Environment - -A browser-based implementation of the 2048 game with configurable target tiles and reward system for RL evaluation. - -## Features - -- **Configurable Target Tile**: Set any power of 2 as target (64, 128, 256, 512, 1024, 2048, etc.) -- **Logarithmic Reward Scaling**: Smooth reward progression using `log(highest_tile) / log(target)` -- **Efficiency Tracking**: Monitor score-to-moves ratio -- **Flexible Board Size**: Support for 3x3 to 6x6 grids -- **Full Evaluation API**: Compatible with RL evaluation system - -## Architecture - -### Backend (FastAPI) -- Core game logic in `game.py` -- RESTful API endpoints for game control -- Evaluation endpoints for RL agents -- SQLite persistence (optional) - -### Frontend (Next.js + React) -- Responsive game board with smooth animations -- Keyboard and touch controls -- Real-time score and progress tracking -- Customizable game parameters - -## Running the Game - -### Standalone -```bash -python launch.py --frontend-port 3001 --backend-port 5001 -``` - -### With Browser Environment -The game integrates with the browser environment's setup and evaluation system. - -## API Endpoints - -### Core Game -- `POST /api/game/new` - Start new game -- `GET /api/game/state` - Get current state -- `POST /api/game/move` - Make a move -- `POST /api/game/set_target` - Set target tile - -### Evaluation -- `GET /api/eval/stats` - Get comprehensive stats -- `GET /api/eval/max_number` - Get highest tile -- `GET /api/eval/efficiency` - Get efficiency ratio -- `POST /api/eval/set_board` - Set specific board -- `POST /api/eval/reset` - Reset game - -## Evaluators - -- `game_2048_max_number` - Check if target tile reached (logarithmic reward) -- `game_2048_efficiency` - Evaluate score/moves ratio -- `game_2048_score_reached` - Check if target score reached -- `game_2048_game_won` - Check if game is won -- `game_2048_game_over` - Check if game is over -- `game_2048_moves_made` - Check minimum moves made - -## Setup Tools - -- `game_2048_board` - Initialize game with size and target -- `game_2048_set_board` - Set specific board state -- `game_2048_near_win` - Set board near winning -- `game_2048_navigate` - Navigate to game URL -- `game_2048_reset` - Reset to initial state - -## Reward System - -The reward system matches the text-2048 environment: - -1. **Max Number Reward**: `min(1.0, log(highest_tile) / log(target))` - - Logarithmic scaling for smooth progression - - Reaches 1.0 when target tile is achieved - -2. 
**Efficiency Reward**: `min(1.0, ratio / min_ratio)` - - Linear scaling based on score/moves ratio - - Encourages efficient gameplay - -## Development - -### Backend Requirements -- Python 3.8+ -- FastAPI -- NumPy -- uvicorn - -### Frontend Requirements -- Node.js 16+ -- Next.js 14 -- React 18 -- Tailwind CSS - -## Testing - -The game can be tested with the browser environment's evaluation system: - -```python -# Example evaluation -ctx = Context() -result = await game_2048_max_number(ctx, target=2048) -``` \ No newline at end of file diff --git a/environments/browser/environment/2048/backend/game.py b/environments/browser/environment/2048/backend/game.py deleted file mode 100644 index e13f3b38..00000000 --- a/environments/browser/environment/2048/backend/game.py +++ /dev/null @@ -1,241 +0,0 @@ -"""2048 Game Logic for Browser Environment""" - -import random -import numpy as np -from typing import Tuple, Optional, List - - -class Game2048: - """Browser-based 2048 game implementation with configurable target""" - - def __init__(self, size: int = 4, target_tile: int = 2048): - self.size = size - self.target_tile = target_tile - self.board = np.zeros((size, size), dtype=int) - self.score = 0 - self.game_over = False - self.moves_made = 0 - self.won = False - - # Start with 2 random tiles - self.add_random_tile() - self.add_random_tile() - - # Track initial highest tile for reward calculation - self.initial_highest_tile = int(self.board.max()) - - def add_random_tile(self) -> bool: - """Add a random 2 or 4 tile to an empty position""" - empty_cells = [ - (i, j) for i in range(self.size) for j in range(self.size) if self.board[i, j] == 0 - ] - - if not empty_cells: - return False - - i, j = random.choice(empty_cells) - # 90% chance of 2, 10% chance of 4 - self.board[i, j] = 2 if random.random() < 0.9 else 4 - return True - - def compress(self, row: np.ndarray) -> Tuple[np.ndarray, int]: - """Compress a row by moving all non-zero elements to the left and merging""" - new_row = np.zeros_like(row) - pos = 0 - score = 0 - - # Move all non-zero elements to the left - for num in row: - if num != 0: - new_row[pos] = num - pos += 1 - - # Merge adjacent equal elements - i = 0 - while i < len(new_row) - 1: - if new_row[i] != 0 and new_row[i] == new_row[i + 1]: - new_row[i] *= 2 - score += new_row[i] - new_row[i + 1] = 0 - i += 2 - else: - i += 1 - - # Compress again after merging - final_row = np.zeros_like(row) - pos = 0 - for num in new_row: - if num != 0: - final_row[pos] = num - pos += 1 - - return final_row, score - - def move(self, direction: str) -> bool: - """Make a move in the specified direction""" - if self.game_over: - return False - - direction = direction.lower() - if direction not in ["up", "down", "left", "right"]: - return False - - original_board = self.board.copy() - move_score = 0 - - if direction == "left": - for i in range(self.size): - self.board[i], row_score = self.compress(self.board[i]) - move_score += row_score - - elif direction == "right": - for i in range(self.size): - reversed_row = self.board[i][::-1] - compressed, row_score = self.compress(reversed_row) - self.board[i] = compressed[::-1] - move_score += row_score - - elif direction == "up": - for j in range(self.size): - column = self.board[:, j] - compressed, col_score = self.compress(column) - self.board[:, j] = compressed - move_score += col_score - - elif direction == "down": - for j in range(self.size): - column = self.board[:, j][::-1] - compressed, col_score = self.compress(column) - self.board[:, j] = 
compressed[::-1] - move_score += col_score - - # Check if the board changed - if not np.array_equal(original_board, self.board): - self.score += move_score - self.moves_made += 1 - self.add_random_tile() - self.check_game_status() - return True - - return False - - def check_game_status(self): - """Check if the game is won or over""" - # Check if target tile is reached - if not self.won and self.board.max() >= self.target_tile: - self.won = True - - # Check if game is over (no valid moves) - # Check for empty cells - if 0 in self.board: - self.game_over = False - return - - # Check for possible merges - for i in range(self.size): - for j in range(self.size): - current = self.board[i, j] - # Check right neighbor - if j < self.size - 1 and current == self.board[i, j + 1]: - self.game_over = False - return - # Check bottom neighbor - if i < self.size - 1 and current == self.board[i + 1, j]: - self.game_over = False - return - - self.game_over = True - - def get_state(self) -> dict: - """Get the current game state as a dictionary""" - return { - "board": self.board.tolist(), - "score": int(self.score), - "moves": int(self.moves_made), - "game_over": bool(self.game_over), - "won": bool(self.won), - "highest_tile": int(self.board.max()), - "initial_highest_tile": int(self.initial_highest_tile), - "target_tile": self.target_tile, - "board_size": self.size, - } - - def set_board(self, board: List[List[int]], score: int = 0, moves: int = 0): - """Set a specific board configuration (for testing)""" - self.board = np.array(board, dtype=int) - self.score = score - self.moves_made = moves - self.check_game_status() - - def reset(self, size: Optional[int] = None, target_tile: Optional[int] = None): - """Reset the game to initial state - - Args: - size: Optional new board size - target_tile: Optional new target tile - """ - if size is not None: - self.size = size - if target_tile is not None: - self.target_tile = target_tile - - self.board = np.zeros((self.size, self.size), dtype=int) - self.score = 0 - self.game_over = False - self.won = False - self.moves_made = 0 - self.add_random_tile() - self.add_random_tile() - - # Track initial highest tile after reset - self.initial_highest_tile = int(self.board.max()) - - def can_move(self) -> dict: - """Check which moves are valid""" - valid_moves = {"up": False, "down": False, "left": False, "right": False} - - if self.game_over: - return valid_moves - - # Test each direction without modifying the actual board - original_board = self.board.copy() - - for direction in ["up", "down", "left", "right"]: - test_board = original_board.copy() - self.board = test_board - - # Try the move - if direction == "left": - for i in range(self.size): - compressed, _ = self.compress(self.board[i]) - if not np.array_equal(self.board[i], compressed): - valid_moves[direction] = True - break - - elif direction == "right": - for i in range(self.size): - reversed_row = self.board[i][::-1] - compressed, _ = self.compress(reversed_row) - if not np.array_equal(reversed_row, compressed): - valid_moves[direction] = True - break - - elif direction == "up": - for j in range(self.size): - column = self.board[:, j] - compressed, _ = self.compress(column) - if not np.array_equal(column, compressed): - valid_moves[direction] = True - break - - elif direction == "down": - for j in range(self.size): - column = self.board[:, j][::-1] - compressed, _ = self.compress(column) - if not np.array_equal(column, compressed): - valid_moves[direction] = True - break - - # Restore original board - self.board 
= original_board - return valid_moves diff --git a/environments/browser/environment/2048/backend/main.py b/environments/browser/environment/2048/backend/main.py deleted file mode 100644 index 8cfba5ce..00000000 --- a/environments/browser/environment/2048/backend/main.py +++ /dev/null @@ -1,246 +0,0 @@ -"""FastAPI backend for 2048 game""" - -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel -from typing import List, Optional -from datetime import datetime -import sqlite3 -import json -from game import Game2048 - -app = FastAPI(title="2048 Game API", version="1.0.0") - -# Configure CORS -app.add_middleware( - CORSMiddleware, - allow_origins=["http://localhost:3001"], # Different port from todo app - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Global game instance (in production, would use sessions/database) -game = Game2048() - - -# Pydantic models -class NewGameRequest(BaseModel): - board_size: int = 4 - target_tile: int = 2048 - - -class MoveRequest(BaseModel): - direction: str # up, down, left, right - - -class SetBoardRequest(BaseModel): - board: List[List[int]] - score: Optional[int] = 0 - moves: Optional[int] = 0 - - -class SetTargetRequest(BaseModel): - target_tile: int - - -class GameState(BaseModel): - board: List[List[int]] - score: int - moves: int - game_over: bool - won: bool - highest_tile: int - initial_highest_tile: int - target_tile: int - board_size: int - - -class EvaluationStats(BaseModel): - board: List[List[int]] - score: int - moves: int - highest_tile: int - target_tile: int - efficiency: float - game_over: bool - won: bool - valid_moves: dict - - -# === CORE GAME API ROUTES === - - -@app.get("/api/status") -def status(): - """Health check endpoint""" - return {"status": "ok", "timestamp": datetime.now().isoformat()} - - -@app.post("/api/game/new", response_model=GameState) -def new_game(request: NewGameRequest): - """Start a new game with specified parameters""" - global game - game = Game2048(size=request.board_size, target_tile=request.target_tile) - return game.get_state() - - -@app.get("/api/game/state", response_model=GameState) -def get_game_state(): - """Get current game state""" - return game.get_state() - - -@app.post("/api/game/move", response_model=GameState) -def make_move(request: MoveRequest): - """Make a move in the specified direction""" - valid = game.move(request.direction) - if not valid and not game.game_over: - raise HTTPException(status_code=400, detail="Invalid move") - return game.get_state() - - -@app.post("/api/game/set_target", response_model=GameState) -def set_target(request: SetTargetRequest): - """Set the target tile for the game""" - game.target_tile = request.target_tile - game.check_game_status() # Re-check win condition - return game.get_state() - - -@app.get("/api/game/valid_moves") -def get_valid_moves(): - """Get which moves are currently valid""" - return game.can_move() - - -# === EVALUATION API ROUTES === - - -@app.get("/api/eval/health") -def eval_health(): - """Health check endpoint for evaluation system""" - return { - "status": "healthy", - "game_active": not game.game_over, - "highest_tile": int(game.board.max()), - "target_tile": game.target_tile, - "timestamp": datetime.now().isoformat(), - } - - -@app.get("/api/eval/stats", response_model=EvaluationStats) -def get_evaluation_stats(): - """Comprehensive evaluation statistics for the game""" - state = game.get_state() - efficiency = state["score"] / 
state["moves"] if state["moves"] > 0 else 0.0 - - return EvaluationStats( - board=state["board"], - score=state["score"], - moves=state["moves"], - highest_tile=state["highest_tile"], - target_tile=state["target_tile"], - efficiency=efficiency, - game_over=state["game_over"], - won=state["won"], - valid_moves=game.can_move(), - ) - - -@app.get("/api/eval/max_number") -def get_max_number(): - """Get the highest tile value for evaluation""" - state = game.get_state() - return { - "highest_tile": state["highest_tile"], - "target_tile": state["target_tile"], - "progress": state["highest_tile"] / state["target_tile"] if state["target_tile"] > 0 else 0, - "timestamp": datetime.now().isoformat(), - } - - -@app.get("/api/eval/efficiency") -def get_efficiency(): - """Get the game efficiency (score/moves ratio)""" - state = game.get_state() - efficiency = state["score"] / state["moves"] if state["moves"] > 0 else 0.0 - - return { - "score": state["score"], - "moves": state["moves"], - "efficiency": efficiency, - "timestamp": datetime.now().isoformat(), - } - - -@app.get("/api/eval/board") -def get_board(): - """Get current board state for evaluation""" - state = game.get_state() - return { - "board": state["board"], - "board_size": state["board_size"], - "empty_cells": sum(1 for row in state["board"] for cell in row if cell == 0), - "timestamp": datetime.now().isoformat(), - } - - -@app.post("/api/eval/set_board", response_model=GameState) -def set_board(request: SetBoardRequest): - """Set a specific board configuration for testing""" - try: - game.set_board(request.board, request.score, request.moves) - return game.get_state() - except Exception as e: - raise HTTPException(status_code=400, detail=str(e)) - - -@app.post("/api/eval/reset", response_model=GameState) -def reset_game(): - """Reset game to initial state""" - game.reset() - return game.get_state() - - -@app.post("/api/eval/seed") -def seed_test_board(): - """Seed the board with a test configuration""" - # Create a board that's close to winning - test_board = [[1024, 512, 256, 128], [64, 32, 16, 8], [4, 2, 0, 0], [0, 0, 0, 0]] - game.set_board(test_board, score=10000, moves=100) - - return { - "message": "Test board seeded successfully", - "highest_tile": 1024, - "timestamp": datetime.now().isoformat(), - } - - -@app.post("/api/eval/seed_custom") -def seed_custom_board(board: List[List[int]]): - """Seed the board with a custom configuration""" - try: - game.set_board(board) - state = game.get_state() - return { - "message": "Custom board seeded successfully", - "highest_tile": state["highest_tile"], - "timestamp": datetime.now().isoformat(), - } - except Exception as e: - raise HTTPException(status_code=400, detail=str(e)) - - -@app.get("/api/eval/can_move") -def can_move(): - """Check if any moves are available""" - valid_moves = game.can_move() - has_moves = any(valid_moves.values()) - - return { - "can_move": has_moves, - "valid_moves": valid_moves, - "game_over": game.game_over, - "timestamp": datetime.now().isoformat(), - } diff --git a/environments/browser/environment/2048/backend/pyproject.toml b/environments/browser/environment/2048/backend/pyproject.toml deleted file mode 100644 index d3c16ae0..00000000 --- a/environments/browser/environment/2048/backend/pyproject.toml +++ /dev/null @@ -1,9 +0,0 @@ -[project] -name = "game-2048-backend" -version = "1.0.0" -dependencies = [ - "fastapi", - "uvicorn", - "numpy", - "pydantic" -] \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/app/globals.css 
b/environments/browser/environment/2048/frontend/app/globals.css deleted file mode 100644 index bd6213e1..00000000 --- a/environments/browser/environment/2048/frontend/app/globals.css +++ /dev/null @@ -1,3 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/app/layout.tsx b/environments/browser/environment/2048/frontend/app/layout.tsx deleted file mode 100644 index bcb24f69..00000000 --- a/environments/browser/environment/2048/frontend/app/layout.tsx +++ /dev/null @@ -1,22 +0,0 @@ -import type { Metadata } from 'next' -import { Inter } from 'next/font/google' -import './globals.css' - -const inter = Inter({ subsets: ['latin'] }) - -export const metadata: Metadata = { - title: '2048 Game', - description: 'A browser-based 2048 game with configurable targets', -} - -export default function RootLayout({ - children, -}: { - children: React.ReactNode -}) { - return ( - - {children} - - ) -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/app/page.tsx b/environments/browser/environment/2048/frontend/app/page.tsx deleted file mode 100644 index 3b56cede..00000000 --- a/environments/browser/environment/2048/frontend/app/page.tsx +++ /dev/null @@ -1,190 +0,0 @@ -'use client'; - -import { useState, useEffect, useCallback } from 'react'; -import GameBoard from '../components/GameBoard'; -import GameControls from '../components/GameControls'; - -// Dynamically determine API URL based on current port -// Backend is always on frontend_port + 1 -const getApiUrl = () => { - if (typeof window !== 'undefined') { - const currentPort = parseInt(window.location.port) || 3000; - return `http://localhost:${currentPort + 1}`; - } - return process.env.NEXT_PUBLIC_API_URL || 'http://localhost:5001'; -}; - -const API_URL = getApiUrl(); - -interface GameState { - board: number[][]; - score: number; - moves: number; - game_over: boolean; - won: boolean; - highest_tile: number; - target_tile: number; - board_size: number; -} - -export default function Game2048() { - const [gameState, setGameState] = useState(null); - const [loading, setLoading] = useState(false); - const [message, setMessage] = useState(''); - - // Load initial game state - useEffect(() => { - fetchGameState(); - }, []); - - // Handle keyboard input - useEffect(() => { - const handleKeyPress = (e: KeyboardEvent) => { - if (gameState?.game_over) return; - - const keyMap: { [key: string]: string } = { - 'ArrowUp': 'up', - 'ArrowDown': 'down', - 'ArrowLeft': 'left', - 'ArrowRight': 'right', - }; - - const direction = keyMap[e.key]; - if (direction) { - e.preventDefault(); - makeMove(direction); - } - }; - - window.addEventListener('keydown', handleKeyPress); - return () => window.removeEventListener('keydown', handleKeyPress); - }, [gameState]); - - const fetchGameState = async () => { - try { - const response = await fetch(`${API_URL}/api/game/state`); - const data = await response.json(); - setGameState(data); - } catch (error) { - console.error('Error fetching game state:', error); - setMessage('Error loading game'); - } - }; - - const makeMove = async (direction: string) => { - if (loading) return; - setLoading(true); - - try { - const response = await fetch(`${API_URL}/api/game/move`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ direction }), - }); - - if (response.ok) { - const data = await response.json(); - setGameState(data); - - if (data.won && 
!gameState?.won) { - setMessage(`๐ŸŽ‰ You reached ${data.target_tile}!`); - } else if (data.game_over) { - setMessage('Game Over! No more moves available.'); - } - } else { - // Invalid move, just ignore - } - } catch (error) { - console.error('Error making move:', error); - } finally { - setLoading(false); - } - }; - - const newGame = async (boardSize: number = 4, targetTile: number = 2048) => { - setLoading(true); - setMessage(''); - - try { - const response = await fetch(`${API_URL}/api/game/new`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ board_size: boardSize, target_tile: targetTile }), - }); - - const data = await response.json(); - setGameState(data); - } catch (error) { - console.error('Error starting new game:', error); - setMessage('Error starting new game'); - } finally { - setLoading(false); - } - }; - - // Touch/swipe handling - const [touchStart, setTouchStart] = useState<{ x: number; y: number } | null>(null); - - const handleTouchStart = (e: React.TouchEvent) => { - const touch = e.touches[0]; - setTouchStart({ x: touch.clientX, y: touch.clientY }); - }; - - const handleTouchEnd = (e: React.TouchEvent) => { - if (!touchStart) return; - - const touch = e.changedTouches[0]; - const deltaX = touch.clientX - touchStart.x; - const deltaY = touch.clientY - touchStart.y; - const minSwipeDistance = 50; - - if (Math.abs(deltaX) > Math.abs(deltaY)) { - // Horizontal swipe - if (Math.abs(deltaX) > minSwipeDistance) { - makeMove(deltaX > 0 ? 'right' : 'left'); - } - } else { - // Vertical swipe - if (Math.abs(deltaY) > minSwipeDistance) { - makeMove(deltaY > 0 ? 'down' : 'up'); - } - } - - setTouchStart(null); - }; - - if (!gameState) { - return ( -
-
Loading game...
-
- ); - } - - return ( -
-
-

2048

- - - -
- -
- -
-

Use arrow keys to play

-

Combine tiles to reach {gameState.target_tile}!

-
-
-
- ); -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/components/GameBoard.tsx b/environments/browser/environment/2048/frontend/components/GameBoard.tsx deleted file mode 100644 index d5678e41..00000000 --- a/environments/browser/environment/2048/frontend/components/GameBoard.tsx +++ /dev/null @@ -1,31 +0,0 @@ -import React from 'react'; -import GameTile from './GameTile'; - -interface GameBoardProps { - board: number[][]; -} - -export default function GameBoard({ board }: GameBoardProps) { - const boardSize = board.length; - - return ( -
-
- {board.map((row, i) => - row.map((value, j) => ( - - )) - )} -
-
- ); -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/components/GameControls.tsx b/environments/browser/environment/2048/frontend/components/GameControls.tsx deleted file mode 100644 index b89b3613..00000000 --- a/environments/browser/environment/2048/frontend/components/GameControls.tsx +++ /dev/null @@ -1,104 +0,0 @@ -import React, { useState } from 'react'; - -interface GameState { - score: number; - moves: number; - game_over: boolean; - won: boolean; - highest_tile: number; - target_tile: number; -} - -interface GameControlsProps { - gameState: GameState; - onNewGame: (boardSize: number, targetTile: number) => void; - message: string; -} - -export default function GameControls({ gameState, onNewGame, message }: GameControlsProps) { - const [targetTile, setTargetTile] = useState(gameState.target_tile); - const [boardSize, setBoardSize] = useState(4); - - const efficiency = gameState.moves > 0 - ? (gameState.score / gameState.moves).toFixed(1) - : '0.0'; - - return ( -
- {/* Score and Stats */} -
-
-
Score
-
{gameState.score}
-
-
-
Moves
-
{gameState.moves}
-
-
-
Highest
-
{gameState.highest_tile}
-
-
-
Efficiency
-
{efficiency}
-
-
- - {/* Game Controls */} -
-
-
- - -
- -
- - -
- - -
-
- - {/* Status Message */} - {message && ( -
- {message} -
- )} -
- ); -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/components/GameTile.tsx b/environments/browser/environment/2048/frontend/components/GameTile.tsx deleted file mode 100644 index e3b4bdfc..00000000 --- a/environments/browser/environment/2048/frontend/components/GameTile.tsx +++ /dev/null @@ -1,53 +0,0 @@ -import React from 'react'; - -interface GameTileProps { - value: number; - position: { row: number; col: number }; -} - -export default function GameTile({ value }: GameTileProps) { - const getTileColor = (val: number): string => { - const colors: { [key: number]: string } = { - 0: 'bg-gray-200', - 2: 'bg-yellow-100', - 4: 'bg-yellow-200', - 8: 'bg-orange-300', - 16: 'bg-orange-400', - 32: 'bg-orange-500', - 64: 'bg-red-400', - 128: 'bg-yellow-300', - 256: 'bg-yellow-400', - 512: 'bg-yellow-500', - 1024: 'bg-yellow-600', - 2048: 'bg-yellow-700', - 4096: 'bg-purple-600', - 8192: 'bg-purple-700', - }; - return colors[val] || 'bg-purple-800'; - }; - - const getTextSize = (val: number): string => { - if (val === 0) return ''; - if (val < 100) return 'text-3xl'; - if (val < 1000) return 'text-2xl'; - return 'text-xl'; - }; - - const getTextColor = (val: number): string => { - return val > 4 ? 'text-white' : 'text-gray-800'; - }; - - return ( -
- {value > 0 && value} -
- ); -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/next.config.js b/environments/browser/environment/2048/frontend/next.config.js deleted file mode 100644 index cf97dc63..00000000 --- a/environments/browser/environment/2048/frontend/next.config.js +++ /dev/null @@ -1,6 +0,0 @@ -/** @type {import('next').NextConfig} */ -const nextConfig = { - reactStrictMode: true, -} - -module.exports = nextConfig \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/package.json b/environments/browser/environment/2048/frontend/package.json deleted file mode 100644 index 7a7e412c..00000000 --- a/environments/browser/environment/2048/frontend/package.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "name": "game-2048-frontend", - "version": "1.0.0", - "private": true, - "scripts": { - "dev": "next dev", - "build": "next build", - "start": "next start", - "lint": "next lint" - }, - "dependencies": { - "next": "14.1.0", - "react": "^18", - "react-dom": "^18", - "swr": "^2.2.4" - }, - "devDependencies": { - "@types/node": "^20", - "@types/react": "^18", - "@types/react-dom": "^18", - "autoprefixer": "^10.0.1", - "eslint": "^8", - "eslint-config-next": "14.1.0", - "postcss": "^8", - "tailwindcss": "^3.3.0", - "typescript": "^5" - } -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/postcss.config.js b/environments/browser/environment/2048/frontend/postcss.config.js deleted file mode 100644 index 96bb01e7..00000000 --- a/environments/browser/environment/2048/frontend/postcss.config.js +++ /dev/null @@ -1,6 +0,0 @@ -module.exports = { - plugins: { - tailwindcss: {}, - autoprefixer: {}, - }, -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/tailwind.config.js b/environments/browser/environment/2048/frontend/tailwind.config.js deleted file mode 100644 index 47bc0bad..00000000 --- a/environments/browser/environment/2048/frontend/tailwind.config.js +++ /dev/null @@ -1,12 +0,0 @@ -/** @type {import('tailwindcss').Config} */ -module.exports = { - content: [ - './pages/**/*.{js,ts,jsx,tsx,mdx}', - './components/**/*.{js,ts,jsx,tsx,mdx}', - './app/**/*.{js,ts,jsx,tsx,mdx}', - ], - theme: { - extend: {}, - }, - plugins: [], -} \ No newline at end of file diff --git a/environments/browser/environment/2048/frontend/tsconfig.json b/environments/browser/environment/2048/frontend/tsconfig.json deleted file mode 100644 index 9b9948d5..00000000 --- a/environments/browser/environment/2048/frontend/tsconfig.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "compilerOptions": { - "target": "es5", - "lib": ["dom", "dom.iterable", "esnext"], - "allowJs": true, - "skipLibCheck": true, - "strict": true, - "noEmit": true, - "esModuleInterop": true, - "module": "esnext", - "moduleResolution": "bundler", - "resolveJsonModule": true, - "isolatedModules": true, - "jsx": "preserve", - "incremental": true, - "plugins": [ - { - "name": "next" - } - ], - "paths": { - "@/*": ["./*"] - } - }, - "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"], - "exclude": ["node_modules"] -} \ No newline at end of file diff --git a/environments/browser/environment/2048/launch.py b/environments/browser/environment/2048/launch.py deleted file mode 100644 index a5645668..00000000 --- a/environments/browser/environment/2048/launch.py +++ /dev/null @@ -1,284 +0,0 @@ -#!/usr/bin/env python3 -"""2048 game launcher script.""" - -import subprocess -import time -import signal -import sys -import argparse 
-import logging -import os -import socket -from pathlib import Path -from typing import Optional - -# Configure logging to stderr to avoid stdio contamination -logging.basicConfig(level=logging.INFO, format="[%(asctime)s] 2048: %(message)s", stream=sys.stderr) - -# Global variables to track processes -frontend_process: Optional[subprocess.Popen] = None -backend_process: Optional[subprocess.Popen] = None - - -def cleanup_processes(): - """Clean up running processes.""" - global frontend_process, backend_process - logging.info("Shutting down services...") - - for proc in [frontend_process, backend_process]: - if proc and proc.poll() is None: - proc.terminate() - try: - proc.wait(timeout=5) - except subprocess.TimeoutExpired: - proc.kill() - - -def signal_handler(sig, frame): - """Handle shutdown signals.""" - cleanup_processes() - sys.exit(0) - - -def check_port_available(port: int) -> bool: - """Check if a port is available.""" - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(1) - try: - result = sock.connect_ex(("localhost", port)) - sock.close() - return result != 0 # Port is available if connection fails - except: - return True - - -def launch_app(frontend_port: int = 3001, backend_port: int = 5001): - """Launch the 2048 game with frontend and backend.""" - global frontend_process, backend_process - - # Set up signal handlers - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - # Get current directory - app_dir = Path(__file__).parent - frontend_dir = app_dir / "frontend" - backend_dir = app_dir / "backend" - - logging.info( - f"Starting 2048 game - Frontend port: {frontend_port}, Backend port: {backend_port}" - ) - - # Check if ports are available - if not check_port_available(backend_port): - logging.warning(f"Backend port {backend_port} is already in use") - if not check_port_available(frontend_port): - logging.warning(f"Frontend port {frontend_port} is already in use") - - # Prepare backend command - backend_env = { - "PORT": str(backend_port), - "PYTHONPATH": str(backend_dir), - **dict(os.environ), - } - - # Check if we can use uv, otherwise fall back to system python - try: - subprocess.run(["uv", "--version"], check=True, capture_output=True) - backend_cmd = [ - "uv", - "run", - "uvicorn", - "main:app", - "--host", - "0.0.0.0", - "--port", - str(backend_port), - ] - logging.info("Using uv for backend") - except (subprocess.CalledProcessError, FileNotFoundError): - # Fall back to system python with uvicorn - logging.info("uv not available, using system python for backend") - backend_cmd = [ - "python3", - "-m", - "uvicorn", - "main:app", - "--host", - "0.0.0.0", - "--port", - str(backend_port), - ] - - # Prepare frontend command - frontend_env = { - "NEXT_PUBLIC_API_URL": f"http://localhost:{backend_port}", - "PORT": str(frontend_port), - **dict(os.environ), - } - - # Check if dependencies are installed - if frontend_dir.exists(): - node_modules = frontend_dir / "node_modules" - if not node_modules.exists(): - logging.info("Installing frontend dependencies...") - npm_install = subprocess.run( - ["npm", "install"], cwd=frontend_dir, capture_output=True - ) - if npm_install.returncode != 0: - logging.error( - f"Failed to install npm dependencies: {npm_install.stderr.decode()}" - ) - cleanup_processes() - raise RuntimeError("npm install failed") - - # Check if we have a production build - if (frontend_dir / ".next").exists(): - logging.info("Running in production mode (pre-built)...") - frontend_cmd = [ - 
"npm", - "run", - "start", - "--", - "--port", - str(frontend_port), - "--hostname", - "0.0.0.0", - ] - else: - logging.info("Running in development mode...") - frontend_cmd = [ - "npm", - "run", - "dev", - "--", - "--port", - str(frontend_port), - "--hostname", - "0.0.0.0", - ] - - # ๐Ÿš€ START BOTH PROCESSES IN PARALLEL - logging.info("Starting backend and frontend in parallel...") - - # Start backend - UPDATE GLOBAL VARIABLE - backend_process = subprocess.Popen( - backend_cmd, - cwd=backend_dir, - env=backend_env, - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, # Don't capture stdout - reserved for MCP - stderr=subprocess.DEVNULL, # Don't capture stderr - reserved for MCP - ) - - # Start frontend immediately (in parallel) - UPDATE GLOBAL VARIABLE - if frontend_dir.exists(): - frontend_process = subprocess.Popen( - frontend_cmd, - cwd=frontend_dir, - env=frontend_env, - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, # Don't capture stdout - reserved for MCP - stderr=subprocess.DEVNULL, # Don't capture stderr - reserved for MCP - ) - - # ๐Ÿš€ WAIT FOR BOTH IN PARALLEL WITH FAST POLLING - backend_ready = False - frontend_ready = False - - # Use faster polling (every 200ms instead of 1s) - max_attempts_backend = 150 # 30 seconds at 200ms intervals - max_attempts_frontend = 600 # 120 seconds at 200ms intervals - - for attempt in range(max(max_attempts_backend, max_attempts_frontend)): - # Check if processes are still alive - if backend_process and backend_process.poll() is not None: - logging.error(f"Backend process died with exit code {backend_process.returncode}") - cleanup_processes() - raise RuntimeError("Backend failed to start") - - if frontend_process and frontend_process.poll() is not None: - logging.error(f"Frontend process died with exit code {frontend_process.returncode}") - cleanup_processes() - raise RuntimeError("Frontend failed to start") - - # Check backend readiness - if not backend_ready and attempt < max_attempts_backend: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(0.1) - try: - result = sock.connect_ex(("localhost", backend_port)) - sock.close() - if result == 0: - backend_ready = True - logging.info(f"Backend is ready (attempt {attempt + 1})") - except: - pass - - # Check frontend readiness - if not frontend_ready and attempt < max_attempts_frontend: - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(0.1) - try: - result = sock.connect_ex(("localhost", frontend_port)) - sock.close() - if result == 0: - frontend_ready = True - logging.info(f"Frontend is ready (attempt {attempt + 1})") - except: - pass - - # Exit early if both are ready - if backend_ready and frontend_ready: - break - - time.sleep(0.2) # 200ms intervals instead of 1s - - # Check final status - if not backend_ready: - logging.error("Backend did not start within 30 seconds") - cleanup_processes() - raise RuntimeError("Backend startup timeout") - - if not frontend_ready: - logging.error("Frontend did not start within 2 minutes") - cleanup_processes() - raise RuntimeError("Frontend startup timeout") - - # Log startup information - logging.info("2048 game started successfully!") - logging.info(f"Frontend: http://localhost:{frontend_port}") - logging.info(f"Backend API: http://localhost:{backend_port}/docs") - logging.info("Press Ctrl+C to stop") - - # Wait for processes to finish - while True: - time.sleep(1) - if backend_process and backend_process.poll() is not None: - logging.error("Backend process died unexpectedly") - break - if 
frontend_process and frontend_process.poll() is not None: - logging.error("Frontend process died unexpectedly") - break - - except Exception as e: - logging.error(f"Error launching app: {e}") - cleanup_processes() - raise - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Launch 2048 Game") - parser.add_argument("--frontend-port", type=int, default=3001, help="Frontend port") - parser.add_argument("--backend-port", type=int, default=5001, help="Backend port") - - args = parser.parse_args() - - try: - launch_app(args.frontend_port, args.backend_port) - except KeyboardInterrupt: - logging.info("App interrupted by user") - except Exception as e: - logging.error(f"Failed to launch app: {e}") - sys.exit(1) diff --git a/environments/browser/environment/README.md b/environments/browser/environment/README.md deleted file mode 100644 index 2c86019e..00000000 --- a/environments/browser/environment/README.md +++ /dev/null @@ -1,135 +0,0 @@ -# Apps Directory - -Launchable web applications for the HUD browser environment. Each app is a self-contained service that can be dynamically launched. - -## App Specification - -Each app must implement: - -### Required Files -- `launch.py` - Entry point script with standardized arguments -- `backend/` - Backend service (required) -- `frontend/` - Frontend service (optional) - -### Launch Script Interface - -```python -# launch.py -import argparse - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--frontend-port", type=int) - parser.add_argument("--backend-port", type=int, required=True) - args = parser.parse_args() - - # Start your services on the provided ports - # Backend must run on args.backend_port - # Frontend (if present) should run on args.frontend_port - -if __name__ == "__main__": - main() -``` - -### Service Requirements - -**Backend** -- Must bind to the provided `--backend-port` -- Should implement health check endpoint (`/health`) -- Must handle graceful shutdown -- Should use production-ready server (uvicorn, gunicorn, etc.) - -**Frontend** (Optional) -- Must bind to the provided `--frontend-port` -- Should be a static build or development server -- Common frameworks: Next.js, React, Vue, etc. - -## App Lifecycle - -1. **Discovery** - Apps are discovered by scanning subdirectories -2. **Launch** - Controller calls `python launch.py --backend-port=5000 --frontend-port=3000` -3. **Registration** - Ports are registered for API access -4. **Operation** - App services run independently -5. 
**Cleanup** - Processes terminated when environment shuts down - -## Integration Patterns - -### Basic Web App -```python -# Minimal FastAPI backend -from fastapi import FastAPI -import uvicorn - -app = FastAPI() - -@app.get("/health") -def health(): - return {"status": "healthy"} - -if __name__ == "__main__": - import sys - port = int(sys.argv[sys.argv.index("--backend-port") + 1]) - uvicorn.run(app, host="0.0.0.0", port=port) -``` - -### Full-Stack App -```python -# launch.py for app with both frontend and backend -import subprocess -import sys - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("--frontend-port", type=int) - parser.add_argument("--backend-port", type=int, required=True) - args = parser.parse_args() - - # Start backend - backend_proc = subprocess.Popen([ - "uvicorn", "backend.main:app", - "--host", "0.0.0.0", - "--port", str(args.backend_port) - ]) - - # Start frontend (if port provided) - if args.frontend_port: - frontend_proc = subprocess.Popen([ - "npm", "run", "dev", "--", "--port", str(args.frontend_port) - ], cwd="frontend") - - # Wait for processes - try: - backend_proc.wait() - except KeyboardInterrupt: - backend_proc.terminate() - if args.frontend_port: - frontend_proc.terminate() -``` - -## Optional Integrations - -### Evaluation APIs -Apps can optionally provide evaluation endpoints for testing: -- `GET /api/eval/health` - Health check -- `GET /api/eval/stats` - Application statistics -- Additional endpoints as needed - -### Environment Access -Apps can access the browser environment through: -- Shared network (communicate with controller) -- File system (shared volumes) -- Environment variables - -## Development Guidelines - -- **Port Binding** - Always use provided ports, never hardcode -- **Health Checks** - Implement basic health endpoints -- **Logging** - Use structured logging for debugging -- **Dependencies** - Manage dependencies with lockfiles -- **Graceful Shutdown** - Handle SIGTERM properly -- **Error Handling** - Return meaningful error responses - -## Examples - -- `todo/` - Full-stack Next.js + FastAPI application with evaluation integration -- See individual app READMEs for specific implementation details \ No newline at end of file diff --git a/environments/browser/environment/__init__.py b/environments/browser/environment/__init__.py deleted file mode 100644 index 36902690..00000000 --- a/environments/browser/environment/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Browser environment server package.""" - -__version__ = "0.1.0" diff --git a/environments/browser/environment/pyproject.toml b/environments/browser/environment/pyproject.toml deleted file mode 100644 index f6f853f8..00000000 --- a/environments/browser/environment/pyproject.toml +++ /dev/null @@ -1,23 +0,0 @@ -[project] -name = "hud-browser-environment" -version = "0.1.0" -description = "HUD Browser Environment Backend" -requires-python = ">=3.11,<3.14" -dependencies = [ - "fastapi>=0.104.1", - "uvicorn[standard]>=0.24.0", - "python-multipart>=0.0.6", - "pydantic>=2.6,<3", - "pydantic-settings>=2.2,<3", - "httpx", -] - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.metadata] -allow-direct-references = true - -[tool.hatch.build.targets.wheel] -packages = ["environment"] diff --git a/environments/browser/environment/server.py b/environments/browser/environment/server.py deleted file mode 100644 index bd1297c7..00000000 --- a/environments/browser/environment/server.py +++ /dev/null @@ -1,503 +0,0 @@ -""" -FastAPI 
server for browser environment. -Exposes API endpoints to interact with the environment and its subcomponents. -""" - -import asyncio -import subprocess -import os -import logging -from pathlib import Path -from typing import Optional, Dict, List, Any, Set -import socket -from contextlib import asynccontextmanager -import shutil -import httpx - -from fastapi import FastAPI, HTTPException -from pydantic import BaseModel - -# Configure logging -logging.basicConfig( - level=logging.INFO, format="[%(levelname)s] %(asctime)s | %(name)s | %(message)s" -) -logger = logging.getLogger(__name__) - - -class AppInfo(BaseModel): - """Information about a launched app.""" - - name: str - frontend_port: int - backend_port: int - url: str - status: str - - -class ServiceStatus(BaseModel): - """Status of environment services.""" - - x11: bool - vnc: bool - websockify: bool - apps: List[AppInfo] - - -class LaunchAppRequest(BaseModel): - """Request to launch an app.""" - - app_name: str - - -class LaunchAppResponse(BaseModel): - """Response after launching an app.""" - - name: str - url: str - frontend_port: int - backend_port: int - - -class ServiceManager: - """Manages environment services (X11, VNC, apps).""" - - def __init__(self): - self.x11_proc: Optional[subprocess.Popen] = None - self.vnc_proc: Optional[subprocess.Popen] = None - self.websockify_proc: Optional[subprocess.Popen] = None - self.chrome_proc: Optional[subprocess.Popen] = None - self.cdp_port: Optional[int] = None - self._launched_apps: Dict[str, AppInfo] = {} - self._playwright = None - self._browser = None - self._app_processes: Dict[str, subprocess.Popen] = {} - self._allocated_ports: Set[int] = set() - - async def start_core_services(self): - """Start X11, VNC, and websockify services.""" - # Check if X11 is already running - if Path("/tmp/.X11-unix/X1").exists(): - logger.info("X11 display :1 already running") - else: - # Start Xvfb if not already running - self.x11_proc = subprocess.Popen( - ["Xvfb", ":1", "-screen", "0", "1920x1080x24"], - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - ) - logger.info("Started Xvfb on display :1") - - # Wait for X11 - await self._wait_for_x11() - - # Start VNC and websockify - await self._start_vnc_services() - - async def _wait_for_x11(self): - """Wait for X11 display to be ready.""" - for i in range(100): # 10 seconds max - if Path("/tmp/.X11-unix/X1").exists(): - logger.info("X11 display :1 is ready") - os.environ["DISPLAY"] = ":1" - return - await asyncio.sleep(0.1) - raise TimeoutError("X11 failed to start") - - async def _start_vnc_services(self): - """Start VNC and websockify services.""" - # Start x11vnc - self.vnc_proc = subprocess.Popen( - ["x11vnc", "-display", ":1", "-forever", "-shared", "-nopw"], - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - env={**os.environ, "DISPLAY": ":1"}, - ) - logger.info("Started x11vnc") - - # Start websockify - self.websockify_proc = subprocess.Popen( - ["websockify", "--web", "/usr/share/novnc", "8080", "localhost:5900"], - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - ) - logger.info("Started websockify on port 8080") - - # Wait for both services - await asyncio.gather( - self._wait_for_port(5900, "VNC"), self._wait_for_port(8080, "websockify") - ) - logger.info("noVNC available at: http://localhost:8080/vnc.html") - - # Start Playwright's Chromium browser - logger.info("Starting Playwright's Chromium browser") - try: - from playwright.async_api 
import async_playwright - - self._playwright = await async_playwright().start() - # Get a free port for CDP - self.cdp_port = self._get_next_port() - - self._browser = await self._playwright.chromium.launch( - headless=False, - args=[ - f"--remote-debugging-port={self.cdp_port}", - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-gpu", - "--disable-web-security", - "--disable-features=IsolateOrigins,site-per-process", - "--display=:1", - "--start-maximized", - ], - env={**os.environ, "DISPLAY": ":1"}, - ) - - logger.info(f"Started Playwright Chromium with CDP on port {self.cdp_port}") - - # Wait for CDP to be ready - await self._wait_for_port(self.cdp_port, "CDP", timeout=30) - - # Open a default page so the browser window is visible - default_context = await self._browser.new_context( - viewport={"width": 1920, "height": 1080}, no_viewport=False - ) - default_page = await default_context.new_page() - await default_page.goto("about:blank") - logger.info("Opened default browser page") - - except ImportError: - logger.error("Playwright not installed") - raise RuntimeError("Playwright is required. The Docker image should have installed it.") - except Exception as e: - logger.error(f"Failed to start Playwright browser: {e}") - raise - - async def launch_app(self, app_name: str) -> LaunchAppResponse: - """Launch a specific app dynamically.""" - # Check if app is already running - if app_name in self._launched_apps: - app_info = self._launched_apps[app_name] - if app_info.status == "running": - return LaunchAppResponse( - name=app_info.name, - url=app_info.url, - frontend_port=app_info.frontend_port, - backend_port=app_info.backend_port, - ) - - app_path = Path(f"/app/environment/{app_name}") - if not app_path.exists(): - raise ValueError(f"App '{app_name}' not found at {app_path}") - - # Check if app has a launch script - launch_script = app_path / "launch.py" - if not launch_script.exists(): - raise ValueError(f"App '{app_name}' missing launch.py") - - # Get unique ports for frontend and backend - frontend_port = self._get_next_port() - backend_port = self._get_next_port() - - # Launch the app - proc = subprocess.Popen( - [ - "python3", - str(launch_script), - "--frontend-port", - str(frontend_port), - "--backend-port", - str(backend_port), - ], - cwd=app_path, - stdin=subprocess.DEVNULL, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - env={**os.environ, "DISPLAY": ":1"}, - ) - - self._app_processes[app_name] = proc - - try: - # Wait for both ports - await asyncio.gather( - self._wait_for_port(frontend_port, f"app '{app_name}' frontend", timeout=60), - self._wait_for_port(backend_port, f"app '{app_name}' backend", timeout=60), - ) - - logger.info( - f"Launched app '{app_name}' - Frontend: {frontend_port}, Backend: {backend_port}" - ) - - # Store app information - app_info = AppInfo( - name=app_name, - frontend_port=frontend_port, - backend_port=backend_port, - url=f"http://localhost:{frontend_port}", - status="running", - ) - self._launched_apps[app_name] = app_info - - return LaunchAppResponse( - name=app_name, - url=app_info.url, - frontend_port=frontend_port, - backend_port=backend_port, - ) - - except TimeoutError: - # Check if process is still running - if proc.poll() is not None: - logger.error(f"App '{app_name}' process exited with code {proc.returncode}") - else: - logger.error(f"App '{app_name}' failed to become ready within timeout") - raise - - def get_service_status(self) -> ServiceStatus: - """Get status of all services.""" - # Update app statuses - for 
app_name, proc in self._app_processes.items(): - if app_name in self._launched_apps: - if proc.poll() is None: - self._launched_apps[app_name].status = "running" - else: - self._launched_apps[app_name].status = "stopped" - - return ServiceStatus( - x11=self.x11_proc is not None and self.x11_proc.poll() is None - if self.x11_proc - else Path("/tmp/.X11-unix/X1").exists(), - vnc=self.vnc_proc is not None and self.vnc_proc.poll() is None - if self.vnc_proc - else self._is_port_open(5900), - websockify=self.websockify_proc is not None and self.websockify_proc.poll() is None - if self.websockify_proc - else self._is_port_open(8080), - apps=list(self._launched_apps.values()), - ) - - def get_app_info(self, app_name: str) -> AppInfo: - """Get information about a specific app.""" - if app_name not in self._launched_apps: - raise ValueError(f"App '{app_name}' not found") - return self._launched_apps[app_name] - - async def shutdown(self): - """Shutdown all services gracefully.""" - # Stop app processes - for name, proc in self._app_processes.items(): - if proc.poll() is None: - proc.terminate() - await asyncio.sleep(1) - if proc.poll() is None: - proc.kill() - logger.info(f"Terminated app '{name}'") - - # Clear app tracking - self._app_processes.clear() - self._launched_apps.clear() - self._allocated_ports.clear() - - # Close Playwright browser - if self._browser: - try: - await self._browser.close() - logger.info("Closed Playwright browser") - except Exception as e: - logger.error(f"Error closing browser: {e}") - - if self._playwright: - try: - await self._playwright.stop() - logger.info("Stopped Playwright") - except Exception as e: - logger.error(f"Error stopping playwright: {e}") - - # Stop services in reverse order - for proc, name in [ - (self.websockify_proc, "websockify"), - (self.vnc_proc, "x11vnc"), - (self.x11_proc, "Xvfb"), - ]: - if proc and proc.poll() is None: - proc.terminate() - await asyncio.sleep(0.5) - if proc.poll() is None: - proc.kill() - logger.info(f"Stopped {name}") - - def _is_port_open(self, port: int) -> bool: - """Check if a port is open.""" - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.settimeout(0.1) - try: - result = sock.connect_ex(("localhost", port)) - sock.close() - return result == 0 - except: - return False - - def _get_next_port(self) -> int: - """Get next available port for apps.""" - base_port = 3000 - for offset in range(200): # Support up to 200 ports - port = base_port + offset - if not self._is_port_open(port) and port not in self._allocated_ports: - self._allocated_ports.add(port) - return port - raise RuntimeError("No available ports") - - async def _wait_for_port(self, port: int, service_name: str = "service", timeout: int = 30): - """Wait for a port to become available.""" - for _ in range(timeout * 5): # Check every 200ms - if self._is_port_open(port): - logger.info(f"{service_name} is ready on port {port}") - return - await asyncio.sleep(0.2) - raise TimeoutError(f"Port {port} did not become available for {service_name}") - - async def get_cdp_websocket_url(self) -> str | None: - """Discover the actual CDP WebSocket URL from Chrome's /json/version endpoint.""" - if not self.cdp_port: - return None - - try: - async with httpx.AsyncClient() as client: - response = await client.get( - f"http://localhost:{self.cdp_port}/json/version", timeout=5.0 - ) - if response.status_code == 200: - data = response.json() - # Chrome returns webSocketDebuggerUrl in /json/version response - websocket_url = data.get("webSocketDebuggerUrl") - if 
websocket_url: - return websocket_url - - # Fallback: try /json/list to find a browser target - response = await client.get( - f"http://localhost:{self.cdp_port}/json/list", timeout=5.0 - ) - if response.status_code == 200: - targets = response.json() - # Look for a browser target (type 'page' or title containing 'about:blank') - for target in targets: - if target.get("type") == "page" or "about:blank" in target.get("url", ""): - websocket_url = target.get("webSocketDebuggerUrl") - if websocket_url: - return websocket_url - - except Exception as e: - logger.warning(f"Failed to discover CDP WebSocket URL: {e}") - - # Final fallback to generic path (may not work) - return f"ws://localhost:{self.cdp_port}/devtools/browser" - - -# Global service manager instance -service_manager = ServiceManager() - - -@asynccontextmanager -async def lifespan(app: FastAPI): - """Manage application lifecycle.""" - # Startup - logger.info("Starting browser environment server...") - await service_manager.start_core_services() - logger.info("Browser environment server ready") - - yield - - # Shutdown - logger.info("Shutting down browser environment server...") - await service_manager.shutdown() - - -# Create FastAPI app -app = FastAPI( - title="Browser Environment API", - description="API for managing browser environment services and applications", - version="1.0.0", - lifespan=lifespan, -) - - -@app.get("/health") -async def health_check(): - """Health check endpoint.""" - return {"status": "healthy"} - - -@app.get("/status", response_model=ServiceStatus) -async def get_status(): - """Get status of all environment services.""" - return service_manager.get_service_status() - - -@app.post("/apps/launch", response_model=LaunchAppResponse) -async def launch_app(request: LaunchAppRequest): - """Launch a specific application.""" - try: - return await service_manager.launch_app(request.app_name) - except ValueError as e: - raise HTTPException(status_code=404, detail=str(e)) - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - - -@app.get("/apps/{app_name}", response_model=AppInfo) -async def get_app_info(app_name: str): - """Get information about a specific app.""" - try: - return service_manager.get_app_info(app_name) - except ValueError as e: - raise HTTPException(status_code=404, detail=str(e)) - - -@app.get("/vnc/url") -async def get_vnc_url(): - """Get the VNC viewer URL.""" - return {"url": "http://localhost:8080/vnc.html"} - - -@app.get("/display") -async def get_display(): - """Get the X11 display information.""" - return { - "display": os.environ.get("DISPLAY", ":1"), - "x11_running": Path("/tmp/.X11-unix/X1").exists(), - } - - -@app.get("/cdp") -async def get_cdp(): - """Return the CDP websocket URL for connecting Playwright/Chromium clients.""" - if service_manager.cdp_port is None: - raise HTTPException(status_code=503, detail="CDP not available") - - # Discover the actual CDP WebSocket URL from Chrome - websocket_url = await service_manager.get_cdp_websocket_url() - if not websocket_url: - raise HTTPException(status_code=503, detail="CDP WebSocket URL not available") - - return {"ws": websocket_url} - - -@app.post("/shutdown") -async def shutdown_env(): - """Gracefully stop services and request server shutdown.""" - try: - await service_manager.shutdown() - except Exception as e: - logger.warning(f"Error during environment shutdown: {e}") - # Signal uvicorn to exit via lifespan shutdown - # FastAPI/uvicorn doesn't expose server here; we rely on process signal from caller. 
- return {"status": "shutting_down"} - - -if __name__ == "__main__": - import uvicorn - - uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/environments/browser/environment/todo/README.md b/environments/browser/environment/todo/README.md deleted file mode 100644 index 7d2460e9..00000000 --- a/environments/browser/environment/todo/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# Todo App - -Simple todo list application with Next.js frontend and FastAPI backend, fully integrated with the HUD evaluation system. - -## Tech Stack - -- **Frontend**: Next.js, TypeScript, Tailwind CSS -- **Backend**: FastAPI, SQLite, uv for dependency management -- **Evaluation**: Comprehensive API endpoints for testing - -## Development - -```bash -# Backend -cd backend && uv run uvicorn main:app --reload - -# Frontend -cd frontend && npm install && npm run dev -``` - -## Launching - -```python -await client.call_tool("launch_app", {"app_name": "todo"}) -``` - -## Evaluation Integration - -### Backend API Endpoints -- `GET /api/eval/health` - Health check -- `GET /api/eval/stats` - Comprehensive statistics -- `GET /api/eval/has_todo?text=` - Check if todo exists -- `GET /api/eval/completion_rate` - Completion percentage -- `POST /api/eval/seed` - Seed test data -- `DELETE /api/eval/reset` - Reset database - -### Controller Components -- **Evaluators**: `TodoCompletedEvaluator`, `TodoExistsEvaluator`, `CompositeEvaluator` -- **Setup Tools**: `TodoSeedSetup`, `TodoResetSetup`, `TodoCustomSeedSetup` -- **Problems**: `TodoBasicUsageProblem`, `TodoCompositeWeightedProblem` - -### Usage Examples - -```python -# Complete problem execution -await setup({"name": "todo_basic_usage"}) -await evaluate({"name": "todo_basic_usage"}) - -# Direct function calls -await setup({"name": "todo_reset", "arguments": {}}) -await evaluate({"name": "todo_completion_rate", "arguments": {"min_rate": 0.5}}) - -# MCP resource discovery -todo_evaluators = await client.read_resource("evaluators://todo") -``` - -## Database Schema - -```sql -CREATE TABLE items ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - title TEXT NOT NULL, - description TEXT, - completed BOOLEAN DEFAULT FALSE, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP -); -``` - -## Testing - -### Manual -1. Launch app: `await launch_app("todo")` -2. Access at http://localhost:3000 -3. 
Run evaluations - -### Automated -```bash -# Test APIs -curl http://localhost:5000/api/eval/health -curl http://localhost:5000/api/eval/stats - -# Test MCP tools -await setup({"name": "todo_basic_usage"}) -await evaluate({"name": "todo_basic_usage"}) -``` \ No newline at end of file diff --git a/environments/browser/environment/todo/backend/main.py b/environments/browser/environment/todo/backend/main.py deleted file mode 100644 index 5839fa85..00000000 --- a/environments/browser/environment/todo/backend/main.py +++ /dev/null @@ -1,391 +0,0 @@ -from fastapi import FastAPI, HTTPException -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel -from typing import List, Optional -from datetime import datetime -import sqlite3 -import json - -app = FastAPI(title="Todo API with Evaluation", version="0.2.0") - -# Configure CORS -app.add_middleware( - CORSMiddleware, - allow_origins=["http://localhost:3000"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - - -# Pydantic models -class Item(BaseModel): - id: Optional[int] = None - title: str - description: str - completed: bool = False - created_at: Optional[datetime] = None - - -class ItemCreate(BaseModel): - title: str - description: str - completed: bool = False - - -class BulkUpdateRequest(BaseModel): - item_ids: List[int] - completed: Optional[bool] = None - - -class EvaluationStats(BaseModel): - total_items: int - completed_items: int - pending_items: int - completion_rate: float - items: List[Item] - timestamps: dict - - -# Database setup -def init_db(): - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute(""" - CREATE TABLE IF NOT EXISTS items ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - title TEXT NOT NULL, - description TEXT, - completed BOOLEAN NOT NULL DEFAULT 0, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - """) - conn.commit() - conn.close() - - -init_db() - - -# === CORE TODO API ROUTES === - - -@app.get("/api/status") -def status(): - return {"status": "ok", "timestamp": datetime.now().isoformat()} - - -@app.get("/api/items", response_model=List[Item]) -def get_items(): - conn = sqlite3.connect("app.db") - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute("SELECT * FROM items ORDER BY created_at DESC") - items = [dict(row) for row in c.fetchall()] - conn.close() - return items - - -@app.post("/api/items", response_model=Item) -def create_item(item: ItemCreate): - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute( - "INSERT INTO items (title, description, completed) VALUES (?, ?, ?)", - (item.title, item.description, item.completed), - ) - item_id = c.lastrowid - conn.commit() - conn.close() - - return get_item(item_id) - - -@app.get("/api/items/{item_id}", response_model=Item) -def get_item(item_id: int): - conn = sqlite3.connect("app.db") - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute("SELECT * FROM items WHERE id = ?", (item_id,)) - item = c.fetchone() - conn.close() - - if not item: - raise HTTPException(status_code=404, detail="Item not found") - - return dict(item) - - -@app.put("/api/items/{item_id}", response_model=Item) -def update_item(item_id: int, item: ItemCreate): - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute( - "UPDATE items SET title = ?, description = ?, completed = ? 
WHERE id = ?", - (item.title, item.description, item.completed, item_id), - ) - conn.commit() - - if c.rowcount == 0: - conn.close() - raise HTTPException(status_code=404, detail="Item not found") - - conn.close() - return get_item(item_id) - - -@app.delete("/api/items/{item_id}") -def delete_item(item_id: int): - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute("DELETE FROM items WHERE id = ?", (item_id,)) - conn.commit() - - if c.rowcount == 0: - conn.close() - raise HTTPException(status_code=404, detail="Item not found") - - conn.close() - return {"message": "Item deleted successfully"} - - -# === EVALUATION API ROUTES === - - -@app.get("/api/eval/health") -def eval_health(): - """Health check endpoint for evaluation system.""" - try: - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute("SELECT COUNT(*) FROM items") - count = c.fetchone()[0] - conn.close() - - return { - "status": "healthy", - "database_accessible": True, - "total_items": count, - "timestamp": datetime.now().isoformat(), - } - except Exception as e: - return {"status": "unhealthy", "error": str(e), "timestamp": datetime.now().isoformat()} - - -@app.get("/api/eval/stats", response_model=EvaluationStats) -def get_evaluation_stats(): - """Comprehensive evaluation statistics for the todo app.""" - conn = sqlite3.connect("app.db") - conn.row_factory = sqlite3.Row - c = conn.cursor() - - # Get total counts - c.execute("SELECT COUNT(*) as total FROM items") - total = c.fetchone()[0] - - c.execute("SELECT COUNT(*) as completed FROM items WHERE completed = 1") - completed = c.fetchone()[0] - - # Get all items with details - c.execute("SELECT * FROM items ORDER BY created_at DESC") - items = [dict(row) for row in c.fetchall()] - - # Get timing information - c.execute(""" - SELECT created_at - FROM items - ORDER BY created_at DESC - LIMIT 1 - """) - last_created_row = c.fetchone() - last_created = last_created_row[0] if last_created_row else None - - c.execute(""" - SELECT created_at - FROM items - WHERE completed = 1 - ORDER BY created_at DESC - LIMIT 1 - """) - last_completed_row = c.fetchone() - last_completed = last_completed_row[0] if last_completed_row else None - - conn.close() - - return EvaluationStats( - total_items=total, - completed_items=completed, - pending_items=total - completed, - completion_rate=completed / total if total > 0 else 0.0, - items=items, - timestamps={"last_created": last_created, "last_completed": last_completed}, - ) - - -@app.get("/api/eval/todos", response_model=List[Item]) -def get_todos_for_evaluation(): - """Get all todos for evaluation purposes (alias for /api/items).""" - return get_items() - - -@app.get("/api/eval/has_todo") -def check_todo_exists(text: str): - """Check if a todo item exists with specific text in title or description.""" - conn = sqlite3.connect("app.db") - conn.row_factory = sqlite3.Row - c = conn.cursor() - c.execute( - """ - SELECT * FROM items - WHERE title LIKE ? OR description LIKE ? 
- ORDER BY created_at DESC - """, - (f"%{text}%", f"%{text}%"), - ) - - items = [dict(row) for row in c.fetchall()] - conn.close() - - return { - "exists": len(items) > 0, - "count": len(items), - "search_text": text, - "matches": items, - "timestamp": datetime.now().isoformat(), - } - - -@app.post("/api/eval/bulk_update") -def bulk_update_items(request: BulkUpdateRequest): - """Update multiple items at once for evaluation purposes.""" - conn = sqlite3.connect("app.db") - c = conn.cursor() - - updated_count = 0 - if request.completed is not None: - for item_id in request.item_ids: - c.execute("UPDATE items SET completed = ? WHERE id = ?", (request.completed, item_id)) - if c.rowcount > 0: - updated_count += 1 - - conn.commit() - conn.close() - - return { - "message": f"Updated {updated_count} items", - "updated_count": updated_count, - "requested_ids": request.item_ids, - "timestamp": datetime.now().isoformat(), - } - - -@app.get("/api/eval/completion_rate") -def get_completion_rate(): - """Get the current completion rate as a percentage.""" - conn = sqlite3.connect("app.db") - c = conn.cursor() - - c.execute("SELECT COUNT(*) as total FROM items") - total = c.fetchone()[0] - - c.execute("SELECT COUNT(*) as completed FROM items WHERE completed = 1") - completed = c.fetchone()[0] - - conn.close() - - rate = completed / total if total > 0 else 0.0 - - return { - "completion_rate": rate, - "completion_percentage": rate * 100, - "completed_items": completed, - "total_items": total, - "timestamp": datetime.now().isoformat(), - } - - -# === EVALUATION UTILITY ROUTES === - - -@app.post("/api/eval/seed") -def seed_test_data(): - """Seed the database with test data for evaluation purposes.""" - test_items = [ - {"title": "Buy groceries", "description": "Get milk, eggs, and bread", "completed": True}, - { - "title": "Walk the dog", - "description": "Take Max for a 30-minute walk", - "completed": True, - }, - { - "title": "Finish project", - "description": "Complete the Q4 presentation", - "completed": False, - }, - {"title": "Call mom", "description": "Weekly check-in call", "completed": False}, - { - "title": "Schedule dentist", - "description": "Book appointment for cleaning", - "completed": False, - }, - ] - - conn = sqlite3.connect("app.db") - c = conn.cursor() - - for item in test_items: - c.execute( - """ - INSERT INTO items (title, description, completed) - VALUES (?, ?, ?) - """, - (item["title"], item["description"], item["completed"]), - ) - - conn.commit() - conn.close() - - return { - "message": "Test data seeded successfully", - "items_added": len(test_items), - "timestamp": datetime.now().isoformat(), - } - - -@app.post("/api/eval/seed_custom") -def seed_custom_data(items: List[ItemCreate]): - """Seed the database with custom test data for evaluation purposes.""" - conn = sqlite3.connect("app.db") - c = conn.cursor() - - items_added = 0 - for item in items: - c.execute( - """ - INSERT INTO items (title, description, completed) - VALUES (?, ?, ?) 
- """, - (item.title, item.description if hasattr(item, "description") else "", item.completed), - ) - items_added += 1 - - conn.commit() - conn.close() - - return { - "message": "Custom test data seeded successfully", - "items_added": items_added, - "timestamp": datetime.now().isoformat(), - } - - -@app.delete("/api/eval/reset") -def reset_database(): - """Reset the database to empty state for clean evaluation.""" - conn = sqlite3.connect("app.db") - c = conn.cursor() - c.execute("DELETE FROM items") - conn.commit() - conn.close() - - return {"message": "Database reset successfully", "timestamp": datetime.now().isoformat()} diff --git a/environments/browser/environment/todo/backend/pyproject.toml b/environments/browser/environment/todo/backend/pyproject.toml deleted file mode 100644 index 493627d5..00000000 --- a/environments/browser/environment/todo/backend/pyproject.toml +++ /dev/null @@ -1,15 +0,0 @@ -[project] -name = "sample-backend" -version = "0.1.0" -description = "FastAPI backend for sample app" -requires-python = ">=3.10" -dependencies = [ - "fastapi==0.109.0", - "uvicorn[standard]==0.27.0", - "sqlalchemy==2.0.25", - "pydantic==2.5.3", - "python-multipart==0.0.6", -] - -[tool.uv] -dev-dependencies = [] \ No newline at end of file diff --git a/environments/browser/environment/todo/frontend/app/globals.css b/environments/browser/environment/todo/frontend/app/globals.css deleted file mode 100644 index de4d11a2..00000000 --- a/environments/browser/environment/todo/frontend/app/globals.css +++ /dev/null @@ -1,3 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; \ No newline at end of file diff --git a/environments/browser/environment/todo/frontend/app/layout.tsx b/environments/browser/environment/todo/frontend/app/layout.tsx deleted file mode 100644 index 0acab9a4..00000000 --- a/environments/browser/environment/todo/frontend/app/layout.tsx +++ /dev/null @@ -1,22 +0,0 @@ -import type { Metadata } from 'next' -import { Inter } from 'next/font/google' -import './globals.css' - -const inter = Inter({ subsets: ['latin'] }) - -export const metadata: Metadata = { - title: 'Sample App', - description: 'A sample Next.js app with FastAPI backend', -} - -export default function RootLayout({ - children, -}: { - children: React.ReactNode -}) { - return ( - - {children} - - ) -} \ No newline at end of file diff --git a/environments/browser/environment/todo/frontend/app/page.tsx b/environments/browser/environment/todo/frontend/app/page.tsx deleted file mode 100644 index c5de6422..00000000 --- a/environments/browser/environment/todo/frontend/app/page.tsx +++ /dev/null @@ -1,289 +0,0 @@ -'use client' - -import { useState, useEffect } from 'react' - -interface Item { - id: number - title: string - description: string - completed: boolean - created_at: string -} - -type FilterType = 'all' | 'active' | 'completed' - -// Dynamically determine API URL based on current port -// Backend is always on frontend_port + 1 -const getApiUrl = () => { - if (typeof window !== 'undefined') { - const currentPort = parseInt(window.location.port) || 3000; - return `http://localhost:${currentPort + 1}`; - } - return process.env.NEXT_PUBLIC_API_URL || 'http://localhost:5000'; -}; - -const API_URL = getApiUrl(); - -export default function Home() { - const [items, setItems] = useState([]) - const [newTitle, setNewTitle] = useState('') - const [newDescription, setNewDescription] = useState('') - const [loading, setLoading] = useState(true) - const [filter, setFilter] = useState('all') - const [searchTerm, 
setSearchTerm] = useState('') - - useEffect(() => { - fetchItems() - }, []) - - const fetchItems = async () => { - try { - const response = await fetch(`${API_URL}/api/items`) - const data = await response.json() - setItems(data) - } catch (error) { - console.error('Error fetching items:', error) - } finally { - setLoading(false) - } - } - - const createItem = async (e: React.FormEvent) => { - e.preventDefault() - if (!newTitle.trim()) return - - try { - const response = await fetch(`${API_URL}/api/items`, { - method: 'POST', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - title: newTitle, - description: newDescription, - completed: false - }) - }) - - if (response.ok) { - setNewTitle('') - setNewDescription('') - fetchItems() - } - } catch (error) { - console.error('Error creating item:', error) - } - } - - const toggleItem = async (id: number, item: Item) => { - try { - const response = await fetch(`${API_URL}/api/items/${id}`, { - method: 'PUT', - headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ - ...item, - completed: !item.completed - }) - }) - - if (response.ok) { - fetchItems() - } - } catch (error) { - console.error('Error updating item:', error) - } - } - - const deleteItem = async (id: number) => { - try { - const response = await fetch(`${API_URL}/api/items/${id}`, { - method: 'DELETE' - }) - - if (response.ok) { - fetchItems() - } - } catch (error) { - console.error('Error deleting item:', error) - } - } - - const markAllComplete = async () => { - const activeItems = items.filter(item => !item.completed) - for (const item of activeItems) { - await toggleItem(item.id, item) - } - } - - const deleteCompleted = async () => { - const completedItems = items.filter(item => item.completed) - for (const item of completedItems) { - await deleteItem(item.id) - } - } - - // Filter and search logic - const filteredItems = items - .filter(item => { - if (filter === 'active') return !item.completed - if (filter === 'completed') return item.completed - return true - }) - .filter(item => { - if (!searchTerm) return true - const term = searchTerm.toLowerCase() - return item.title.toLowerCase().includes(term) || - item.description.toLowerCase().includes(term) - }) - - const stats = { - total: items.length, - active: items.filter(i => !i.completed).length, - completed: items.filter(i => i.completed).length - } - - return ( -
-      {/* Page layout (element markup unrecoverable): "Todo App" heading; a stats bar showing
-          Total ({stats.total}), Active ({stats.active}), and Completed ({stats.completed}) counts;
-          filter and search controls; and an "Add New Item" form whose title text input is bound to
-          newTitle via setNewTitle. */}
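For reference, here is a minimal sketch of how that stripped markup might have looked, reconstructed from the state and handlers that survive earlier in the deleted `page.tsx` (`stats`, `newTitle`, `setNewTitle`, `createItem`). The component name, element structure, and all Tailwind classes except the one preserved on the title input are assumptions, not the original file's contents:

```tsx
import type { FormEvent } from 'react'

// Illustrative sketch only, not the original markup. It reuses the state and
// handlers defined earlier in page.tsx (stats, newTitle, setNewTitle, createItem).
export function TodoPageSketch(props: {
  stats: { total: number; active: number; completed: number }
  newTitle: string
  setNewTitle: (value: string) => void
  createItem: (e: FormEvent) => void
}) {
  const { stats, newTitle, setNewTitle, createItem } = props
  return (
    <main className="max-w-2xl mx-auto p-8">
      <h1 className="text-3xl font-bold mb-4">Todo App</h1>

      {/* Stats Bar */}
      <div className="flex gap-4 mb-6 text-sm">
        <span>Total: {stats.total}</span>
        <span>Active: {stats.active}</span>
        <span>Completed: {stats.completed}</span>
      </div>

      {/* Add Item Form */}
      <form onSubmit={createItem}>
        <h2 className="text-lg font-semibold mb-2">Add New Item</h2>
        <input
          type="text"
          value={newTitle}
          onChange={(e) => setNewTitle(e.target.value)}
          placeholder="Title"
          className="w-full px-4 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500"
        />
      </form>
    </main>
  )
}
```

The original page presumably continued in the same style with the filter buttons, the search input, and a list rendered from `filteredItems`.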