Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 48 additions & 10 deletions src/benchflow/providers/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,12 @@ def _docker_host_address() -> str:
return "host.docker.internal"


# Sandbox environments where the agent runs on the same host as the proxy
# (the host's loopback / docker bridge is reachable from the agent process).
_LOCAL_REACHABLE_ENVIRONMENTS = {"docker", "local", "host", ""}
# Remote cloud sandbox environments where the agent runs on a *different*
# machine than the host proxy. The canonical environment set produced by the
# runtime is {docker, daytona, modal}; these are the ones a host-bound proxy
# cannot be reached from. Any other (unknown) value is treated conservatively
# as reachable — see ``host_proxy_reachable_from_agent``.
_REMOTE_UNREACHABLE_ENVIRONMENTS = {"daytona", "modal"}


def host_proxy_reachable_from_agent(environment: str) -> bool:
Expand All @@ -128,20 +131,33 @@ def host_proxy_reachable_from_agent(environment: str) -> bool:

- ``docker``: the container reaches the host via the docker bridge /
``host.docker.internal``.
- ``local``/``host``: the agent runs directly on the host.

Remote cloud sandboxes (e.g. ``daytona``) run the agent on a different
machine. ``127.0.0.1`` there is the *sandbox's* own loopback, and the
Daytona SSH gateway rejects ``ssh -R`` reverse tunnels, so there is no
address that routes back to the host proxy.
Remote cloud sandboxes (``daytona``, ``modal``) run the agent on a
different machine. ``127.0.0.1`` there is the *sandbox's* own loopback,
and the Daytona SSH gateway rejects ``ssh -R`` reverse tunnels, so there
is no address that routes back to the host proxy.

An unrecognized environment is treated as reachable (conservative: assume
same-host so the proxy is still wired up rather than silently skipped).
"""
return environment in _LOCAL_REACHABLE_ENVIRONMENTS
return environment not in _REMOTE_UNREACHABLE_ENVIRONMENTS


def _bedrock_proxy_command(
    *,
    environment: str,
) -> str:
    """Return the address the agent uses to reach a host-bound proxy.

    Precondition: ``host_proxy_reachable_from_agent(environment)`` is True.
    That predicate is the single gate that decides whether a host proxy is
    usable at all, so callers must consult it before calling this function.

    Args:
        environment: Sandbox environment name (e.g. ``"docker"``).

    Returns:
        ``_docker_host_address()`` when ``environment`` is ``"docker"``;
        otherwise ``BEDROCK_PROXY_LOCAL_HOST``.

    Raises:
        AssertionError: If ``environment`` cannot reach the host proxy.
    """
    # Raise explicitly rather than via an ``assert`` statement: asserts are
    # compiled out under ``python -O``, which would let an unreachable
    # environment silently fall through to the local-host address below.
    if not host_proxy_reachable_from_agent(environment):
        raise AssertionError(
            f"_bedrock_proxy_command called for unreachable environment "
            f"{environment!r}; host_proxy_reachable_from_agent must gate this"
        )
    if environment == "docker":
        return _docker_host_address()
    return BEDROCK_PROXY_LOCAL_HOST
Expand Down Expand Up @@ -401,11 +417,33 @@ async def ensure_bedrock_proxy_runtime(
runtime: ProviderRuntime | None,
environment: str,
) -> tuple[dict[str, str], ProviderRuntime | None]:
"""Start the host-side Bedrock proxy if needed and wire env vars to it."""
"""Start the host-side Bedrock proxy if needed and wire env vars to it.

Unlike the usage telemetry proxy (pure telemetry — safe to skip when the
agent cannot reach the host), the Bedrock proxy is *load-bearing*: it
translates Anthropic/OpenAI requests into AWS Bedrock Converse calls and
signs them with host AWS credentials. There is no direct path for the
agent to reach Bedrock without it. So on a remote sandbox where the host
proxy is unreachable, the run cannot succeed — we fail fast here with an
actionable error instead of injecting an unreachable ``127.0.0.1`` base
URL that would surface as an opaque mid-run ``ECONNREFUSED``.
"""
if not needs_provider_runtime(model):
return agent_env, runtime
assert model is not None

if not host_proxy_reachable_from_agent(environment):
if runtime is not None:
await stop_provider_runtime(runtime)
raise RuntimeError(
f"Bedrock-routed models are not supported on the "
f"'{environment}' sandbox: the host-side Bedrock proxy that "
f"translates and signs requests to AWS Bedrock binds to the "
f"host machine and is unreachable from the agent, which runs "
f"on a separate remote host. Re-run with '--sandbox docker', "
f"or select a model that is not routed through AWS Bedrock."
)

if runtime is None:
backend_model = strip_provider_prefix(model)
frontend_model = _bedrock_frontend_model(
Expand Down
74 changes: 74 additions & 0 deletions tests/test_provider_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,77 @@ async def test_stop_provider_runtime_stops_server(self):
)
await stop_provider_runtime(runtime)
server.stop.assert_awaited_once()


class TestBedrockProxyRemoteSandbox:
"""The Bedrock proxy is load-bearing; on a remote sandbox where the host
proxy is unreachable the run must fail fast rather than inject an
unreachable 127.0.0.1 base URL (the Daytona telemetry-proxy twin bug)."""
Comment on lines +228 to +230
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Include guarded PR/commit in regression test docstring

The new regression test class docstring describes the Daytona/Modal Bedrock proxy regression but does not name the PR or commit it guards, which violates the repository rule in /workspace/benchflow/AGENTS.md (Regression tests must name the PR/commit they guard). This makes future triage and intentional test-retirement decisions harder because maintainers cannot quickly map the test to its historical regression source.

Useful? React with 👍 / 👎.


# For each remote sandbox ("daytona", "modal"), requesting the Bedrock proxy
# runtime for an aws-bedrock model with no existing runtime must raise a
# RuntimeError whose message matches "not supported on the".
@pytest.mark.asyncio
@pytest.mark.parametrize("environment", ["daytona", "modal"])
async def test_bedrock_on_remote_sandbox_fails_fast(self, environment):
with pytest.raises(RuntimeError, match="not supported on the"):
await ensure_bedrock_proxy_runtime(
agent="claude-agent-acp",
agent_env={
"AWS_BEARER_TOKEN_BEDROCK": "bedrock-token",
"AWS_REGION": "us-east-1",
},
model="aws-bedrock/anthropic.claude-haiku-4-5-20251001-v1:0",
runtime=None,
environment=environment,
)

# The RuntimeError raised for a Bedrock model on "daytona" must name the
# offending sandbox and mention the "--sandbox docker" remedy in its message.
@pytest.mark.asyncio
async def test_remote_sandbox_error_is_actionable(self):
with pytest.raises(RuntimeError) as exc:
await ensure_bedrock_proxy_runtime(
agent="codex-acp",
agent_env={"AWS_REGION": "us-east-1"},
model="aws-bedrock/openai.gpt-oss-20b-1:0",
runtime=None,
environment="daytona",
)
message = str(exc.value)
assert "daytona" in message
assert "--sandbox docker" in message

# With a model that has no aws-bedrock prefix, the call on "daytona" must not
# raise: the returned env equals the input env and no runtime is created.
@pytest.mark.asyncio
async def test_non_bedrock_model_on_remote_sandbox_is_noop(self):
# A non-Bedrock model never needs the proxy, so a remote sandbox is fine.
agent_env = {"ANTHROPIC_API_KEY": "sk-test"}
updated, runtime = await ensure_bedrock_proxy_runtime(
agent="claude-agent-acp",
agent_env=agent_env,
model="claude-haiku-4-5",
runtime=None,
environment="daytona",
)
assert updated == agent_env
assert runtime is None

# When an already-constructed ProviderRuntime is passed in for an unreachable
# environment ("modal"), the call must still raise RuntimeError and the
# runtime's server must have had stop() awaited exactly once.
@pytest.mark.asyncio
async def test_stale_runtime_stopped_when_environment_unreachable(self):
server = AsyncMock()
runtime = ProviderRuntime(
kind="aws-bedrock",
host="host.docker.internal",
port=8099,
server=server,
)
with pytest.raises(RuntimeError, match="not supported on the"):
await ensure_bedrock_proxy_runtime(
agent="claude-agent-acp",
agent_env={"AWS_REGION": "us-east-1"},
model="aws-bedrock/anthropic.claude-haiku-4-5-20251001-v1:0",
runtime=runtime,
environment="modal",
)
server.stop.assert_awaited_once()

# Calling the private address helper directly for a remote environment
# ("daytona") must trip its precondition check and raise AssertionError.
def test_bedrock_proxy_command_rejects_unreachable_environment(self):
from benchflow.providers.runtime import _bedrock_proxy_command

with pytest.raises(AssertionError):
_bedrock_proxy_command(environment="daytona")
8 changes: 5 additions & 3 deletions tests/test_usage_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,12 +544,14 @@ async def stop(self):
def test_host_proxy_reachable_only_for_local_environments():
from benchflow.providers.runtime import host_proxy_reachable_from_agent

# docker shares the host's network namespace via the docker bridge.
assert host_proxy_reachable_from_agent("docker") is True
assert host_proxy_reachable_from_agent("local") is True
assert host_proxy_reachable_from_agent("host") is True
assert host_proxy_reachable_from_agent("") is True
# Remote cloud sandboxes run the agent on a separate machine.
assert host_proxy_reachable_from_agent("daytona") is False
assert host_proxy_reachable_from_agent("modal") is False
# Unrecognized environments are treated conservatively as reachable.
assert host_proxy_reachable_from_agent("") is True
assert host_proxy_reachable_from_agent("some-future-local-env") is True
Comment on lines 548 to +554
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Passing test assertions for "local" and "host" environments removed in violation of AGENTS.md

The AGENTS.md rule states: "Don't rewrite passing tests to match new behavior." The test test_host_proxy_reachable_only_for_local_environments removed the assertions assert host_proxy_reachable_from_agent("local") is True and assert host_proxy_reachable_from_agent("host") is True. These assertions still pass under the new blacklist-based implementation ("local" and "host" are not in _REMOTE_UNREACHABLE_ENVIRONMENTS at src/benchflow/providers/runtime.py:123, so host_proxy_reachable_from_agent returns True for both). The assertions were passing tests that were removed, which violates the repository convention.

Suggested change
assert host_proxy_reachable_from_agent("docker") is True
assert host_proxy_reachable_from_agent("local") is True
assert host_proxy_reachable_from_agent("host") is True
assert host_proxy_reachable_from_agent("") is True
# Remote cloud sandboxes run the agent on a separate machine.
assert host_proxy_reachable_from_agent("daytona") is False
assert host_proxy_reachable_from_agent("modal") is False
# Unrecognized environments are treated conservatively as reachable.
assert host_proxy_reachable_from_agent("") is True
assert host_proxy_reachable_from_agent("some-future-local-env") is True
# docker shares the host's network namespace via the docker bridge.
assert host_proxy_reachable_from_agent("docker") is True
assert host_proxy_reachable_from_agent("local") is True
assert host_proxy_reachable_from_agent("host") is True
# Remote cloud sandboxes run the agent on a separate machine.
assert host_proxy_reachable_from_agent("daytona") is False
assert host_proxy_reachable_from_agent("modal") is False
# Unrecognized environments are treated conservatively as reachable.
assert host_proxy_reachable_from_agent("") is True
assert host_proxy_reachable_from_agent("some-future-local-env") is True
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.



def test_total_tokens_is_sum_of_parts():
Expand Down
Loading