TIGER-AI-Lab · nibzard · Apr 28, 2026
diff --git a/Dockerfile.base b/Dockerfile.base
@@ -32,6 +32,11 @@ RUN cd extension-server && UV_PYTHON_PREFERENCE=only-system uv sync
 
 COPY chrome-extension/ ./chrome-extension/
 
+# Steel browser-provider scripts (only run by entrypoint.sh when STEEL_API_KEY
+# is set at container start). They share the extension-server venv created above.
+COPY steel-cdp-shim.py /app/steel-cdp-shim.py
+COPY steel-collect-artifacts.py /app/steel-collect-artifacts.py
+
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod +x /entrypoint.sh
 

diff --git a/README.md b/README.md
@@ -454,6 +454,36 @@ See [test-driver/README.md](test-driver/README.md) for full CLI documentation, b
 
 <br/>
 
+## Cloud browser provider — Steel (optional)
+
+ClawBench's default browser provider is a per-case Docker container running local Chromium. For users who want to run ClawBench against a cloud-hosted browser pool (no local Chromium, no Xvfb/x11vnc gymnastics, scale `--max-concurrent` past local CPU limits), the `--browser=steel` flag routes the in-container CDP traffic to a [Steel](https://docs.steel.dev) cloud session via a small CDP proxy shim. **Local Docker mode remains the default and the canonical scoring path** — Steel is an alternative provider, not a replacement.
+
+```bash
+export STEEL_API_KEY=sk-steel-...
+
+# Single run against a Steel session
+clawbench run test-cases/001-daily-life-food-uber-eats claude-sonnet-4-6 --browser=steel
+
+# Batch — each parallel job creates its own Steel session
+clawbench batch --all-models --case-range 1-50 --max-concurrent 10 --browser=steel
+```
+
+What you get with `--browser=steel` that you don't with local mode:
+
+- `data/steel/session.json` — full Steel session record (`userAgent`, `dimensions`, `deviceConfig`, `region`, `stealthConfig`, `proxySource`, `creditsUsed`, `duration`, `eventCount`, `status`)
+- `data/steel/events.jsonl` — Steel's rrweb event stream (DOM mutations, input, network meta) — strictly richer than ClawBench's recorder-extension `actions.jsonl`
+- `data/steel/context.json` — post-run cookies / localStorage / IndexedDB snapshot (forensics for "did the agent actually log in?")
+- `data/steel/browser-version.json` — Chrome / V8 version captured via CDP `Browser.getVersion`
+- `run-meta.json` includes `steel_session_viewer_url` — one-click replay of any failed run
+
+Notes:
+- Each parallel batch job creates one Steel session. Steel concurrency caps apply per API key (5 on Hobby, 100 on Pro).
+- The `claude-code-chrome-extension` harness is incompatible with `--browser=steel` (its bridge talks to the in-Chrome extension via Chrome native messaging, which can't span the cloud boundary).
+- `--human` mode requires the local browser; pair with `--browser=local`.
+- Eval scoring is unchanged — the existing `extension-server` CDP interceptor connects to `127.0.0.1:9222` exactly as today; the shim transparently forwards to `wss://connect.steel.dev`.
+
+<br/>
+
 # <img src="static/icons/chart-bar.svg" width="28" height="28"> Evaluation
 
 Evaluation is a **post-session** step -- first run agents to collect trajectories, then evaluate them against human reference runs.

diff --git a/entrypoint.sh b/entrypoint.sh
@@ -4,6 +4,86 @@ set -e
 # Ensure /data exists for recording output and diagnostic logs
 mkdir -p /data
 
+# Steel browser-provider mode: front the eval-interceptor and harnesses with
+# a CDP shim that bridges to wss://connect.steel.dev. Skips Xvfb/Chromium/
+# socat/x11vnc/noVNC/ffmpeg — Steel renders cloud-side and provides rrweb +
+# HLS recording via its session API. We still start the extension-server
+# because its CDP handler does the eval interception (Fetch.enable, then
+# Fetch.requestPaused → Fetch.failRequest), and CDP_URL still points at
+# 127.0.0.1:9222 — only now the shim is what's listening there.
+if [ -n "$STEEL_API_KEY" ]; then
+  export CLAWBENCH_STEEL_MODE=1
+  echo "============================================"
+  echo "Steel browser provider active (STEEL_API_KEY detected)"
+  echo "============================================"
+
+  cd /app/extension-server
+  uv run uvicorn server:app --host 0.0.0.0 --port 7878 &
+  sleep 1
+
+  # Start the shim; it creates the Steel session, writes initial artifacts,
+  # then serves /json/version + WS proxy on 127.0.0.1:9222.
+  uv run python /app/steel-cdp-shim.py &
+  SHIM_PID=$!
+  echo "Steel CDP shim started (pid $SHIM_PID); waiting for session..."
+
+  # Install the cleanup trap before waiting on the shim, so a readiness
+  # timeout or shim crash still releases the Steel session and pulls artifacts.
+  cleanup_steel() {
+    if kill -0 "$SHIM_PID" 2>/dev/null; then
+      kill -TERM "$SHIM_PID" 2>/dev/null || true
+      wait "$SHIM_PID" 2>/dev/null || true
+    fi
+    cd /app/extension-server
+    uv run python /app/steel-collect-artifacts.py || true
+  }
+  trap cleanup_steel EXIT
+
+  # Wait for the shim to be ready by polling /json/version (max 60s — Steel
+  # session create is normally <5s but allow headroom for cold starts).
+  for i in $(seq 1 60); do
+    if curl -sf http://127.0.0.1:9222/json/version >/dev/null 2>&1; then
+      echo "Steel CDP shim ready"
+      break
+    fi
+    if ! kill -0 "$SHIM_PID" 2>/dev/null; then
+      echo "ERROR: steel-cdp-shim exited before becoming ready"
+      [ -z "$(cat /data/.stop-reason 2>/dev/null)" ] && \
+        echo "steel_shim_failed" > /data/.stop-reason
+      exit 1
+    fi
+    sleep 1
+  done
+
+  if ! curl -sf http://127.0.0.1:9222/json/version >/dev/null 2>&1; then
+    echo "ERROR: steel-cdp-shim did not become ready within 60s"
+    [ -z "$(cat /data/.stop-reason 2>/dev/null)" ] && \
+      echo "steel_shim_failed" > /data/.stop-reason
+    exit 1
+  fi
+
+  if [ -z "$INSTRUCTION" ]; then
+    echo "No INSTRUCTION set; staying live for manual CDP use against the shim."
+    wait "$SHIM_PID"
+    exit 0
+  fi
+
+  if [ ! -x /run-harness.sh ]; then
+    echo "ERROR: /run-harness.sh missing — image was built without a harness layer"
+    echo "missing_harness" > /data/.stop-reason
+    exit 1
+  fi
+  # Capture the harness exit status explicitly (with `set -e` suspended) so it
+  # becomes the container's exit code; the EXIT trap then runs cleanup_steel.
+  set +e
+  /run-harness.sh
+  HARNESS_EXIT=$?
+  set -e
+  exit $HARNESS_EXIT
+fi
+
+# --- Local Chromium / Docker mode (default) -------------------------------
+
 # Start virtual display
 Xvfb :99 -screen 0 1920x1080x24 &
 export DISPLAY=:99

diff --git a/extension-server/pyproject.toml b/extension-server/pyproject.toml
@@ -4,4 +4,13 @@ version = "0.1.0"
 description = "ClawBench extension server"
 readme = "README.md"
 requires-python = "==3.12.*"
-dependencies = ["fastapi[standard]>=0.115", "websocket-client>=1.8"]
+dependencies = [
+    "fastapi[standard]>=0.115",
+    "websocket-client>=1.8",
+    # Used by ../steel-cdp-shim.py and ../steel-collect-artifacts.py when
+    # the container is launched with STEEL_API_KEY set. They share this
+    # venv (uv sync runs against this pyproject) so we keep their deps
+    # alongside the eval interceptor's.
+    "aiohttp>=3.9",
+    "steel-sdk>=0.17",
+]
diff --git a/extension-server/server.py b/extension-server/server.py
@@ -278,23 +278,28 @@ async def lifespan(app: FastAPI):
         match_body = eval_schema.get("body")
         match_params = eval_schema.get("params")
 
-    # Start screen recording of the Xvfb display
-    display = os.environ.get("DISPLAY", ":99")
-    ffmpeg_proc = subprocess.Popen(
-        [
-            "ffmpeg", "-y",
-            "-f", "x11grab",
-            "-video_size", "1920x1080",
-            "-framerate", "15",
-            "-i", display,
-            "-c:v", "libx264",
-            "-preset", "ultrafast",
-            "-crf", "28",
-            str(RECORDING_PATH),
-        ],
-        stdout=subprocess.DEVNULL,
-        stderr=subprocess.DEVNULL,
-    )
+    # Start screen recording of the Xvfb display.
+    # In Steel mode there is no Xvfb (the shim fronts a cloud browser);
+    # Steel's session API exposes rrweb events + HLS, so we skip ffmpeg.
+    if os.environ.get("CLAWBENCH_STEEL_MODE"):
+        ffmpeg_proc = None
+    else:
+        display = os.environ.get("DISPLAY", ":99")
+        ffmpeg_proc = subprocess.Popen(
+            [
+                "ffmpeg", "-y",
+                "-f", "x11grab",
+                "-video_size", "1920x1080",
+                "-framerate", "15",
+                "-i", display,
+                "-c:v", "libx264",
+                "-preset", "ultrafast",
+                "-crf", "28",
+                str(RECORDING_PATH),
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
 
     # Start CDP handler: always logs requests, optionally blocks by URL pattern + method + body/params
     threading.Thread(target=start_cdp_handler,

diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,12 @@ dev = [
     "build>=1.2",
     "pytest>=8.0",
     "pytest-asyncio>=0.23",
+    # Steel browser-provider scripts (steel-cdp-shim.py,
+    # steel-collect-artifacts.py, tools/probe_steel_multiclient.py).
+    # Listed under dev so host-side tests + the multi-client probe can
+    # run outside a container.
+    "aiohttp>=3.9",
+    "steel-sdk>=0.17",
 ]
 
 [project.scripts]

diff --git a/src/clawbench/batch.py b/src/clawbench/batch.py
@@ -206,6 +206,7 @@ async def run_job(
     batch_start: float,
     no_upload: bool = False,
     harness: str | None = None,
+    browser: str = "local",
 ) -> None:
     assert shutdown_event is not None
     try:
@@ -242,6 +243,8 @@ async def run_job(
                     cmd_parts.append("--no-upload")
                 if harness:
                     cmd_parts += ["--harness", harness]
+                if browser and browser != "local":
+                    cmd_parts += ["--browser", browser]
                 proc = await asyncio.create_subprocess_exec(
                     *cmd_parts,
                     stdout=asyncio.subprocess.PIPE,
@@ -558,7 +561,8 @@ def on_signal() -> None:
     all_tasks = [
         asyncio.create_task(
             run_job(j, sem, throttle, base_output, log_dir, jobs, batch_start,
-                    no_upload=args.no_upload, harness=args.harness)
+                    no_upload=args.no_upload, harness=args.harness,
+                    browser=args.browser)
         )
         for j in jobs
     ]
@@ -618,8 +622,32 @@ def main(argv: list[str] | None = None) -> None:
     from clawbench.run import HARNESSES, DEFAULT_HARNESS
     p.add_argument("--harness", choices=HARNESSES, default=DEFAULT_HARNESS,
                    help=f"Coding-agent harness (default: {DEFAULT_HARNESS})")
+    p.add_argument("--browser", choices=("local", "steel"), default="local",
+                   help="Browser provider passed to each per-case run (default: local)")
     args = p.parse_args(argv)
 
+    if args.browser == "steel":
+        if not os.environ.get("STEEL_API_KEY", "").strip():
+            print("ERROR: --browser=steel requires STEEL_API_KEY in the environment")
+            sys.exit(1)
+        if args.harness == "claude-code-chrome-extension":
+            print("ERROR: --harness=claude-code-chrome-extension is incompatible with "
+                  "--browser=steel (native-messaging bridge cannot span the cloud boundary)")
+            sys.exit(1)
+        # Best-effort visibility into Steel concurrent-session caps. Each
+        # parallel job creates its own Steel session, so --max-concurrent
+        # is also the peak Steel session count.
+        try:
+            from steel import Steel  # type: ignore
+            client = Steel(steel_api_key=os.environ["STEEL_API_KEY"])
+            # Don't fail on this; some SDK versions may not expose .list.
+            sessions = list(client.sessions.list())
+            live = sum(1 for s in sessions if getattr(s, "status", "") == "live")
+            print(f"[batch] Steel: {live} live sessions before start; "
+                  f"--max-concurrent={args.max_concurrent} will add up to that many more")
+        except Exception:
+            pass
+
     rc = asyncio.run(async_main(args))
     sys.exit(rc)
 

diff --git a/src/clawbench/cli.py b/src/clawbench/cli.py
@@ -115,6 +115,9 @@ def tui_cmd() -> None:
 @click.option("--no-upload", is_flag=True, help="Skip HuggingFace upload even if configured.")
 @click.option("--harness", type=_harness_choice(), default=None,
               help="Coding-agent harness (default: openclaw).")
+@click.option("--browser", type=click.Choice(["local", "steel"]), default=None,
+              help="Browser provider: 'local' (Chromium-in-container, default) or "
+                   "'steel' (cloud browser via $STEEL_API_KEY).")
 @click.pass_context
 def run_cmd(
     ctx: click.Context,
@@ -125,6 +128,7 @@ def run_cmd(
     no_build: bool,
     no_upload: bool,
     harness: str | None,
+    browser: str | None,
 ) -> None:
     """Run a single test case against a model (or in --human mode)."""
     from clawbench import run as _run
@@ -152,6 +156,8 @@ def run_cmd(
         argv.append("--no-upload")
     if harness:
         argv += ["--harness", harness]
+    if browser:
+        argv += ["--browser", browser]
     argv += list(ctx.args)
     _run.main(argv)
 
@@ -182,6 +188,8 @@ def run_cmd(
 @click.option("--no-upload", is_flag=True, help="Skip HuggingFace upload for all runs.")
 @click.option("--harness", type=_harness_choice(), default=None,
               help="Coding-agent harness (default: openclaw).")
+@click.option("--browser", type=click.Choice(["local", "steel"]), default=None,
+              help="Browser provider passed to each per-case run (default: local).")
 @click.pass_context
 def batch_cmd(
     ctx: click.Context,
@@ -196,6 +204,7 @@ def batch_cmd(
     dry_run: bool,
     no_upload: bool,
     harness: str | None,
+    browser: str | None,
 ) -> None:
     """Run a model x case cross-product concurrently."""
     from clawbench import batch as _batch
@@ -220,6 +229,8 @@ def batch_cmd(
         argv.append("--no-upload")
     if harness:
         argv += ["--harness", harness]
+    if browser:
+        argv += ["--browser", browser]
     argv += list(ctx.args)
     _batch.main(argv)