From 709dd701473bb72c3bc13818c4694e65392b811a Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 21 May 2026 17:50:49 -0400 Subject: [PATCH 1/5] Extract shared result-bucket classifier The passed/failed/errored/verifier_errored bucketing was duplicated with textually-divergent predicates in Evaluation.run() and TaskMetrics. Extract a single classify_result/classify_result_dict in _utils/scoring.py so both sites consume one definition and the passed+failed+errored+verifier_errored == total invariant holds structurally. Rewrite the tautological test_error_and_verifier_error_counted_once to call the real classifier instead of re-implementing the logic inline. From PR #320 verification. --- src/benchflow/_utils/scoring.py | 50 +++++++++++++++++++++++++++++++++ src/benchflow/evaluation.py | 31 ++++++-------------- src/benchflow/metrics.py | 30 ++++++++++++-------- tests/test_job.py | 36 ++++++++---------------- 4 files changed, 89 insertions(+), 58 deletions(-) diff --git a/src/benchflow/_utils/scoring.py b/src/benchflow/_utils/scoring.py index 48b09ab7..7b3045bb 100644 --- a/src/benchflow/_utils/scoring.py +++ b/src/benchflow/_utils/scoring.py @@ -45,6 +45,56 @@ def classify_verifier_error(verifier_error: str | None) -> str | None: return "verifier_other" +def classify_result( + *, + reward: float | None, + error: str | None, + verifier_error: str | None, +) -> str: + """Classify a single result into exactly one terminal bucket. + + Returns one of ``"passed"``, ``"failed"``, ``"errored"``, + ``"verifier_errored"``. The buckets are disjoint and exhaustive, so + ``passed + failed + errored + verifier_errored == total`` holds + structurally for any set of results. + + Precedence: + + 1. A result with a reward is ``"passed"`` (reward == 1.0) or + ``"failed"`` (any other reward) — an explicit reward always wins, + even when an ``error`` string is also present (e.g. a warning). + 2. With no reward, an agent ``error`` makes it ``"errored"``. + 3. With no reward and no agent error, a ``verifier_error`` makes it + ``"verifier_errored"``. ``errored`` therefore takes precedence over + ``verifier_errored`` when both errors are present. + 4. With no reward and no error of either kind, the result is + ``"errored"`` — a terminal result must land in some bucket, and an + absent reward with no recorded cause is still a failure to produce + a verdict. + """ + if reward is not None: + return "passed" if reward == 1.0 else "failed" + if error: + return "errored" + if verifier_error: + return "verifier_errored" + return "errored" + + +def classify_result_dict(result: dict) -> str: + """Classify a result dict (as persisted to ``result.json``) into a bucket. + + Thin wrapper over :func:`classify_result` for the dict-shaped results + used by ``Evaluation.run()``. See :func:`classify_result` for the + bucket precedence rules. + """ + return classify_result( + reward=extract_reward(result), + error=result.get("error"), + verifier_error=result.get("verifier_error"), + ) + + def pass_rate(*, passed: int, total: int) -> float: """Pass rate over all tasks.""" return passed / total if total > 0 else 0.0 diff --git a/src/benchflow/evaluation.py b/src/benchflow/evaluation.py index f60c905d..338d189e 100644 --- a/src/benchflow/evaluation.py +++ b/src/benchflow/evaluation.py @@ -29,7 +29,7 @@ INSTALL_FAILED, PIPE_CLOSED, classify_error, - extract_reward, + classify_result_dict, pass_rate, pass_rate_excl_errors, ) @@ -662,31 +662,18 @@ async def bounded(td: Path) -> tuple[str, RunResult]: "agent_result": _agent_result_from_rollout(result), } - # Count — all values are dicts now, no type branching needed + # Count — every result lands in exactly one bucket via the shared + # classifier, so passed+failed+errored+verifier_errored == total + # holds structurally (see classify_result in _utils.scoring). + buckets = [classify_result_dict(r) for r in all_results.values()] job_result = EvaluationResult( job_name=self._job_name, config=cfg, total=len(task_dirs), - passed=sum(1 for r in all_results.values() if extract_reward(r) == 1.0), - failed=sum( - 1 - for r in all_results.values() - if (rw := extract_reward(r)) is not None and rw != 1.0 - ), - errored=sum( - 1 - for r in all_results.values() - if r.get("error") and r.get("rewards") is None - ), - # Disjoint from `errored`: a result with both `error` and - # `verifier_error` is counted only as `errored`, so the - # passed+failed+errored+verifier_errored == total invariant holds. - verifier_errored=sum( - 1 - for r in all_results.values() - if r.get("verifier_error") - and not (r.get("error") and r.get("rewards") is None) - ), + passed=buckets.count("passed"), + failed=buckets.count("failed"), + errored=buckets.count("errored"), + verifier_errored=buckets.count("verifier_errored"), elapsed_sec=elapsed, ) diff --git a/src/benchflow/metrics.py b/src/benchflow/metrics.py index 7606cd8f..a3b36f9d 100644 --- a/src/benchflow/metrics.py +++ b/src/benchflow/metrics.py @@ -13,6 +13,7 @@ from benchflow._utils.scoring import ( classify_error, + classify_result, classify_verifier_error, pass_rate, pass_rate_excl_errors, @@ -41,17 +42,30 @@ class TaskMetrics: cost_usd: float | None = None usage_source: str = "unavailable" + @property + def _bucket(self) -> str: + """Terminal bucket via the shared classifier (see _utils.scoring). + + Guarantees passed/failed/errored/verifier_errored are disjoint and + exhaustive — the same classification used by ``Evaluation.run()``. + """ + return classify_result( + reward=self.reward, + error=self.error, + verifier_error=self.verifier_error, + ) + @property def passed(self) -> bool: - return self.reward == 1.0 + return self._bucket == "passed" @property def failed(self) -> bool: - return self.reward is not None and self.reward != 1.0 + return self._bucket == "failed" @property def errored(self) -> bool: - return self.reward is None and self.error is not None + return self._bucket == "errored" @property def verifier_errored(self) -> bool: @@ -60,18 +74,12 @@ def verifier_errored(self) -> bool: Disjoint from `errored`: a task with both `error` and `verifier_error` is classified as `errored` only, so the count buckets never overlap. """ - return ( - self.reward is None - and self.verifier_error is not None - and self.error is None - ) + return self._bucket == "verifier_errored" @property def completed(self) -> bool: """True when task reached a terminal non-error reward state.""" - return ( - not self.errored and not self.verifier_errored and self.reward is not None - ) + return self._bucket in ("passed", "failed") @dataclass diff --git a/tests/test_job.py b/tests/test_job.py index 932a5d11..4bcd847b 100644 --- a/tests/test_job.py +++ b/tests/test_job.py @@ -96,33 +96,19 @@ def test_error_and_verifier_error_counted_once(self): """A result with BOTH error and verifier_error (rewards=None) must be classified into exactly one bucket so the count invariant holds. - Mirrors the disjoint bucketing in Evaluation.run(): a result is - ``errored`` when it has an agent error, and ``verifier_errored`` only - when it has a verifier error AND no agent error. + Exercises the real shared classifier: a result with an agent error is + ``errored``, and ``verifier_errored`` only applies when there is a + verifier error AND no agent error — ``errored`` takes precedence. """ - results = { - "a": { - "rewards": None, - "error": "agent crashed", - "verifier_error": "verifier also failed", - }, + from benchflow._utils.scoring import classify_result_dict + + result = { + "rewards": None, + "error": "agent crashed", + "verifier_error": "verifier also failed", } - errored = sum( - 1 for r in results.values() if r.get("error") and r.get("rewards") is None - ) - verifier_errored = sum( - 1 - for r in results.values() - if r.get("verifier_error") - and not (r.get("error") and r.get("rewards") is None) - ) - # Exactly one bucket — never double-counted. - assert errored == 1 - assert verifier_errored == 0 - counts = self._count(results) - assert counts["passed"] + counts["failed"] + errored + verifier_errored == len( - results - ) + # The real classifier puts a both-errors result in exactly `errored`. + assert classify_result_dict(result) == "errored" class TestRunTaskLoop: From 28c4b7db98f6f3a5bfe52b412fe602a0b6f12844 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 21 May 2026 17:50:54 -0400 Subject: [PATCH 2/5] Declare is_mounted on BaseSandbox; drop verifier getattr fallbacks VerifierConfig.service is always a field and is_mounted is a property on every concrete sandbox, so the defensive getattr() calls in verifier.py are dead. Declare is_mounted on the BaseSandbox ABC (default False) and use direct attribute access for both service and is_mounted. Add a verifier test covering is_mounted=True + service="target": the is_mounted fast path is gated on service == "main", and that guard was previously untested because the test stub always had is_mounted=False. The new test fails if the guard is removed. From PR #321 verification. --- src/benchflow/sandbox/_base.py | 12 +++++++++ src/benchflow/task/verifier.py | 4 +-- tests/test_verifier_multi_container.py | 36 ++++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/src/benchflow/sandbox/_base.py b/src/benchflow/sandbox/_base.py index 535d2f09..5f939582 100644 --- a/src/benchflow/sandbox/_base.py +++ b/src/benchflow/sandbox/_base.py @@ -104,6 +104,18 @@ def __init__( def _uses_compose(self) -> bool: return False + @property + def is_mounted(self) -> bool: + """Whether the rollout dir is host-bind-mounted into the sandbox. + + When ``True`` the agent container's verifier output is already visible + on the host, so the verifier can skip a ``download_dir`` round-trip. + Backends that run remotely (Daytona, Modal) have no bind mount and + override this to ``False``. The default is ``False`` so non-mounted + backends are the safe assumption. + """ + return False + def _maybe_resolve_task_env(self) -> None: if self.task_env_config.env and not self._uses_compose: resolved = resolve_env_vars(self.task_env_config.env) diff --git a/src/benchflow/task/verifier.py b/src/benchflow/task/verifier.py index df4b25da..3a8ade4d 100644 --- a/src/benchflow/task/verifier.py +++ b/src/benchflow/task/verifier.py @@ -147,7 +147,7 @@ async def _verify_test_script(self) -> VerifierResult: service so the verifier can inspect *target-side* state — RCE markers, DB modifications — instead of only the agent workspace (#248). """ - service = getattr(self._task.config.verifier, "service", "main") + service = self._task.config.verifier.service try: await self._sandbox.upload_dir( source_dir=self._task.paths.tests_dir, @@ -202,7 +202,7 @@ async def _verify_test_script(self) -> VerifierResult: # Download verifier output if it is not host-mounted. Only the agent's # ``main`` container has the rollout dir bind-mounted; a target service # never does, so target-side rewards (#248) are always downloaded. - is_mounted = service == "main" and getattr(self._sandbox, "is_mounted", False) + is_mounted = service == "main" and self._sandbox.is_mounted if not is_mounted: try: await self._sandbox.download_dir( diff --git a/tests/test_verifier_multi_container.py b/tests/test_verifier_multi_container.py index da9033ef..32d154eb 100644 --- a/tests/test_verifier_multi_container.py +++ b/tests/test_verifier_multi_container.py @@ -91,8 +91,13 @@ def _make_task(tmp_path: Path, toml: str) -> MagicMock: class _RecordingSandbox: """Sandbox stub that records the ``service`` used for each operation.""" - def __init__(self, rollout_paths: RolloutPaths, reward: str = "1.0") -> None: - self.is_mounted = False + def __init__( + self, + rollout_paths: RolloutPaths, + reward: str = "1.0", + is_mounted: bool = False, + ) -> None: + self.is_mounted = is_mounted self._rollout_paths = rollout_paths self._reward = reward self.upload_calls: list[dict] = [] @@ -170,6 +175,33 @@ async def test_target_side_reward_downloaded_from_target_service( assert sandbox.download_calls, "reward must be downloaded from the target" assert {c["service"] for c in sandbox.download_calls} == {"target"} + @pytest.mark.asyncio + async def test_mounted_sandbox_still_downloads_from_target_service( + self, tmp_path: Path + ) -> None: + """#248: a host-mounted sandbox must still download target rewards. + + The ``is_mounted`` fast path that skips ``download_dir`` is gated on + ``service == "main"`` — only the agent container has the rollout dir + bind-mounted. A non-``main`` target service is never mounted, so its + ``reward.txt`` must still be downloaded even when the sandbox is + otherwise mounted. This fails (RewardFileNotFoundError) if the + ``service == "main" and`` guard on the fast path is dropped. + """ + toml = 'version = "1.0"\n[verifier]\nservice = "target"\n' + task = _make_task(tmp_path, toml) + rollout_paths = RolloutPaths(rollout_dir=tmp_path / "rollout") + rollout_paths.mkdir() + sandbox = _RecordingSandbox(rollout_paths, reward="1.0", is_mounted=True) + + result = await Verifier(task, rollout_paths, sandbox).verify() + + assert sandbox.download_calls, ( + "target reward must be downloaded even when the sandbox is mounted" + ) + assert {c["service"] for c in sandbox.download_calls} == {"target"} + assert result.rewards == {"reward": 1.0} + # --------------------------------------------------------------------------- # Item 3 — cross-container hardening policy From 01ecd15b57276f0aa22da898f64fc3667cd6bea2 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 21 May 2026 17:50:58 -0400 Subject: [PATCH 3/5] Document the acpx: runtime-key namespace contract The acpx: key convention is load-bearing in two coupled sites: _acpx_wrap mints the prefixed name and resolve_agent round-trips it. Make resolve_agent_key the documented single owner of the namespace via an ACPX_KEY_PREFIX constant and acpx_runtime_key() helper, with a module-level contract comment so the coupling is explicit rather than implied by an easily-missed comment. Behavior unchanged. From PR #322 verification. --- src/benchflow/agents/registry.py | 53 +++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py index 026fac99..ac8e495e 100644 --- a/src/benchflow/agents/registry.py +++ b/src/benchflow/agents/registry.py @@ -628,6 +628,40 @@ def infer_env_key_for_model(model: str) -> str | None: VALID_PROTOCOLS = {"acp", "acpx"} +# --------------------------------------------------------------------------- +# The ``acpx:`` runtime-key namespace +# --------------------------------------------------------------------------- +# +# An ``acpx/`` spec resolves to an acpx-wrapped AgentConfig whose +# install/launch commands route through the acpx CLI. That wrapped config is +# registered into ``AGENTS`` (and the installer/launch maps) under a stable +# runtime key prefixed with ``ACPX_KEY_PREFIX`` so later *name-keyed* lookups +# in the Rollout/Evaluation path pick up the wrapped commands. +# +# This namespace is owned end to end by ``resolve_agent_key``: it is the only +# function that *mints* an ``acpx:`` key (by registering the wrapped config). +# Two other sites must agree with that convention and are documented here so +# the contract is explicit rather than implied: +# +# - ``_acpx_wrap`` produces the wrapped config whose ``name`` carries the +# ``acpx:`` prefix (via ``acpx_runtime_key``). +# - ``resolve_agent`` round-trips an already-registered ``acpx:`` key: +# ``parse_agent_spec`` leaves it whole under the default ``acp`` protocol, +# and the ``protocol == "acp" and name in AGENTS`` branch returns it as-is. +# +# Changing the prefix or this round-trip behavior requires updating all three. +ACPX_KEY_PREFIX = "acpx:" + + +def acpx_runtime_key(canonical_name: str) -> str: + """Return the stable ``acpx:`` runtime key for a canonical agent name. + + Single source of truth for the ``acpx:`` namespace — see the module-level + comment above. ``resolve_agent_key`` registers the wrapped config under + this key; ``resolve_agent`` round-trips it back to that config. + """ + return f"{ACPX_KEY_PREFIX}{canonical_name}" + def parse_agent_spec(spec: str) -> tuple[str, str]: """Parse an agent spec like 'acp/claude-agent-acp', 'acpx/claude', or 'claude'. @@ -667,7 +701,8 @@ def _acpx_wrap(config: AgentConfig) -> AgentConfig: break return AgentConfig( - name=f"acpx:{config.name}", + # ``acpx:`` runtime key — see acpx_runtime_key / module-level contract. + name=acpx_runtime_key(config.name), install_cmd=f"{config.install_cmd} && {_ACPX_INSTALL}", launch_cmd=( f'export PATH="{_JS_AGENT_PATH}" && acpx {acpx_agent_name} --approve-all' @@ -702,7 +737,8 @@ def resolve_agent(spec: str) -> AgentConfig: ) # An already-resolved acpx runtime key (e.g. "acpx:claude-agent-acp") - # round-trips: parse_agent_spec leaves it whole and it lives in AGENTS. + # round-trips: parse_agent_spec leaves it whole under the default "acp" + # protocol and it lives in AGENTS. See the ACPX_KEY_PREFIX contract. if protocol == "acp" and name in AGENTS: return AGENTS[name] @@ -725,12 +761,15 @@ def resolve_agent(spec: str) -> AgentConfig: def resolve_agent_key(spec: str) -> str: """Resolve an agent spec to a stable registry key. - For plain ACP agents this is the canonical agent name. For ``acpx/`` - specs the acpx-wrapped config (acpx install/launch commands) is registered - into ``AGENTS``/``AGENT_INSTALLERS``/``AGENT_LAUNCH`` under a stable runtime - key (``acpx:``) so that name-keyed lookups in the + This function owns the ``acpx:`` runtime-key namespace (see the + ``ACPX_KEY_PREFIX`` module-level contract). For plain ACP agents the key + is the canonical agent name. For ``acpx/`` specs the acpx-wrapped + config (acpx install/launch commands) is registered into + ``AGENTS``/``AGENT_INSTALLERS``/``AGENT_LAUNCH`` under the stable runtime + key ``acpx_runtime_key()`` so that name-keyed lookups in the Rollout/Evaluation path use the wrapped commands instead of the literal - spec string. + spec string. ``resolve_agent`` then round-trips that key back to the + wrapped config. Unknown agents are returned unchanged so callers can still surface their own diagnostics (raw-command fallback). From 7f9a550d1369f6b8da79565bf6e32eb8674a33a3 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 21 May 2026 17:51:04 -0400 Subject: [PATCH 4/5] Tighten HF split-file match; make env-file cleanup unconditional _pick_split_file matched basenames starting with "{split}-", which could pick a sibling subset like test-small-00000-of-00001.parquet for split="test". Anchor the sharded match on the HF {split}-NNNNN-of-NNNNN convention so only the genuine shard resolves. _wrap_command_with_env_file chained rm -f with &&, so a failed sourcing step short-circuited the cleanup and leaked the temp env file. Use trap 'rm -f ...' EXIT so the env file is removed unconditionally. From PR #323 verification. --- src/benchflow/sandbox/docker.py | 9 ++++++--- src/benchflow/traces/huggingface.py | 18 +++++++++++++----- tests/test_sandbox.py | 4 ++++ tests/test_traces_huggingface.py | 20 ++++++++++++++++++++ 4 files changed, 43 insertions(+), 8 deletions(-) diff --git a/src/benchflow/sandbox/docker.py b/src/benchflow/sandbox/docker.py index cda68c98..f2797a1d 100644 --- a/src/benchflow/sandbox/docker.py +++ b/src/benchflow/sandbox/docker.py @@ -498,8 +498,10 @@ def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str: The env vars are base64-encoded into the command string (not visible as individual ``KEY=VALUE`` args in ``ps aux``), decoded to a - mode-0600 file inside the container, sourced, then deleted before the - real command runs. + mode-0600 file inside the container, and sourced before the real + command runs. A ``trap ... EXIT`` deletes the temp file + unconditionally — even if the decode/source step fails — so a failed + ``&&`` chain can never leave the env file behind. """ env_body = "".join(f"export {k}={shlex.quote(v)}\n" for k, v in env.items()) encoded = base64.b64encode(env_body.encode()).decode() @@ -507,9 +509,10 @@ def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str: # clobber each other's env file. env_path = f"/tmp/.benchflow_exec_env_{uuid.uuid4().hex[:16]}" return ( + f"trap 'rm -f {env_path}' EXIT && " f"umask 077 && printf %s {shlex.quote(encoded)} | base64 -d > " f"{env_path} && set -a && . {env_path} && set +a && " - f"rm -f {env_path} && {command}" + f"{command}" ) async def attach(self) -> None: diff --git a/src/benchflow/traces/huggingface.py b/src/benchflow/traces/huggingface.py index 48a57703..8de0fed3 100644 --- a/src/benchflow/traces/huggingface.py +++ b/src/benchflow/traces/huggingface.py @@ -66,16 +66,24 @@ def _cache_dir() -> Path: def _pick_split_file(repo_files: list[str], split: str, suffix: str) -> str | None: """Pick the repo file matching *split* and *suffix*, if any. - Matches files under ``data/`` whose basename starts with the split name - (e.g. ``data/test-00000-of-00001.parquet`` or ``data/test.jsonl`` for - ``split="test"``). Returns ``None`` when no file matches so the caller - can fall back to constructed filename candidates. + Matches either the exact ``{split}{suffix}`` basename (e.g. + ``data/test.jsonl`` for ``split="test"``) or the HF sharded-parquet + convention ``{split}-NNNNN-of-NNNNN`` (e.g. + ``data/test-00000-of-00001.parquet``). The sharded match is anchored on + ``\\d+-of-\\d+`` so a sibling *subset* such as + ``test-small-00000-of-00001.parquet`` is not mistaken for ``split="test"``. + Returns ``None`` when no file matches so the caller can fall back to + constructed filename candidates. """ + sharded_re = re.compile(rf"^{re.escape(split)}-\d+-of-\d+") candidates = [ f for f in repo_files if f.endswith(suffix) - and (Path(f).name == f"{split}{suffix}" or Path(f).name.startswith(f"{split}-")) + and ( + Path(f).name == f"{split}{suffix}" + or sharded_re.match(Path(f).name) is not None + ) ] if not candidates: return None diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index a2d10498..6d86e5ce 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -123,6 +123,10 @@ def test_wrap_command_does_not_inline_secret_values(self): assert wrapped.endswith("run-verifier") # Restrictive perms on the env file. assert "umask 077" in wrapped + # Cleanup is via `trap ... EXIT`, so the env file is removed even if + # the decode/source step fails and short-circuits the `&&` chain. + assert wrapped.startswith("trap 'rm -f ") + assert "EXIT" in wrapped @pytest.mark.asyncio async def test_exec_passes_no_dash_e_flags(self, monkeypatch): diff --git a/tests/test_traces_huggingface.py b/tests/test_traces_huggingface.py index 92565cd5..614a7094 100644 --- a/tests/test_traces_huggingface.py +++ b/tests/test_traces_huggingface.py @@ -37,6 +37,26 @@ def test_pick_split_file_returns_none_when_no_match() -> None: assert _pick_split_file(repo_files, "test", ".parquet") is None +def test_pick_split_file_ignores_sibling_subset() -> None: + """A subset like `test-small-*` must not be picked for split="test". + + The sharded match is anchored on the `{split}-NNNNN-of-NNNNN` convention, + so only the genuine `test` shard resolves — never a `test-small` subset. + """ + repo_files = [ + "data/test-small-00000-of-00001.parquet", + "data/test-00000-of-00001.parquet", + ] + picked = _pick_split_file(repo_files, "test", ".parquet") + assert picked == "data/test-00000-of-00001.parquet" + + +def test_pick_split_file_no_match_when_only_subset_present() -> None: + """When only a `test-small` subset exists, split="test" finds nothing.""" + repo_files = ["data/test-small-00000-of-00001.parquet"] + assert _pick_split_file(repo_files, "test", ".parquet") is None + + def test_split_filename_candidates_are_all_split_specific() -> None: """Constructed candidates for a non-train split never reference train.""" candidates = _split_filename_candidates(None, "test", ".parquet") From 38ded564124d719bf1bc2fdf419b2fc697305233 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 21 May 2026 17:51:09 -0400 Subject: [PATCH 5/5] Fix stale docstring; convert llm-judge tests to async MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_api_failure_surfaces_original_error referenced last_error, which was deleted in PR #324 — fix the prose. Convert the ~18 sync tests still using asyncio.run() to async def + await, removing the latent order-dependence class that PR #325 fixed elsewhere (the repo runs asyncio_mode = "auto"). Mechanical conversion only; assertions unchanged. Full suite verified in default, reverse, and shuffled file orderings. From PR #324 / #325 verification. --- tests/test_llm_judge.py | 129 ++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/tests/test_llm_judge.py b/tests/test_llm_judge.py index a90e2909..297bd9f3 100644 --- a/tests/test_llm_judge.py +++ b/tests/test_llm_judge.py @@ -5,7 +5,6 @@ from __future__ import annotations -import asyncio import json from contextlib import AbstractContextManager from pathlib import Path @@ -56,13 +55,13 @@ def test_unparseable_raises(self) -> None: class TestCallJudgeProviderFallback: - def test_api_failure_surfaces_original_error(self) -> None: + async def test_api_failure_surfaces_original_error(self) -> None: """A real API failure on the matching provider is raised as-is. The cross-provider fallback must NOT advance to a provider whose - API cannot serve this model name — otherwise ``last_error`` ends - up holding a misleading model-not-found error from the wrong - provider instead of the genuine failure. + API cannot serve this model name — otherwise the surfaced error is + a misleading model-not-found error from the wrong provider instead + of the genuine failure. """ original = RuntimeError("anthropic: invalid x-api-key") anthropic_mock = AsyncMock(side_effect=original) @@ -75,7 +74,7 @@ def test_api_failure_surfaces_original_error(self) -> None: patch("benchflow.rewards.llm._call_google", google_mock), pytest.raises(RuntimeError, match="invalid x-api-key") as exc_info, ): - asyncio.run(call_judge("claude-haiku-4-5", "prompt", retries=2)) + await call_judge("claude-haiku-4-5", "prompt", retries=2) # The exact original exception object propagates. assert exc_info.value is original @@ -84,7 +83,7 @@ def test_api_failure_surfaces_original_error(self) -> None: openai_mock.assert_not_awaited() google_mock.assert_not_awaited() - def test_import_error_falls_through_to_next_provider(self) -> None: + async def test_import_error_falls_through_to_next_provider(self) -> None: """A missing SDK (ImportError) still falls through to the next provider.""" anthropic_mock = AsyncMock(side_effect=ImportError("no anthropic SDK")) openai_mock = AsyncMock(return_value="ok from openai") @@ -93,13 +92,13 @@ def test_import_error_falls_through_to_next_provider(self) -> None: patch("benchflow.rewards.llm._call_anthropic", anthropic_mock), patch("benchflow.rewards.llm._call_openai", openai_mock), ): - result = asyncio.run(call_judge("claude-haiku-4-5", "prompt", retries=2)) + result = await call_judge("claude-haiku-4-5", "prompt", retries=2) assert result == "ok from openai" # ImportError is not retried. assert anthropic_mock.await_count == 1 - def test_all_sdks_missing_raises_judge_environment_error(self) -> None: + async def test_all_sdks_missing_raises_judge_environment_error(self) -> None: """When *every* provider SDK is missing, call_judge raises a JudgeEnvironmentError — a distinct, identifiable environment failure, not a generic RuntimeError that callers might mistake for a verdict.""" @@ -111,7 +110,7 @@ def test_all_sdks_missing_raises_judge_environment_error(self) -> None: patch("benchflow.rewards.llm._call_google", missing), pytest.raises(JudgeEnvironmentError, match="judge extra"), ): - asyncio.run(call_judge("claude-haiku-4-5", "prompt", retries=2)) + await call_judge("claude-haiku-4-5", "prompt", retries=2) def test_judge_environment_error_is_a_runtime_error(self) -> None: """JudgeEnvironmentError stays a RuntimeError subclass so existing @@ -149,25 +148,25 @@ def _patch_sdk(self, response: _FakeResponse) -> AbstractContextManager[None]: ) return patch.dict("sys.modules", {"anthropic": fake_anthropic}) - def test_empty_content_returns_empty_string(self) -> None: + async def test_empty_content_returns_empty_string(self) -> None: with self._patch_sdk(_FakeResponse([])): - result = asyncio.run(_call_anthropic("claude-haiku-4-5", "prompt", 100)) + result = await _call_anthropic("claude-haiku-4-5", "prompt", 100) assert result == "" - def test_non_text_first_block_returns_empty_string(self) -> None: + async def test_non_text_first_block_returns_empty_string(self) -> None: with self._patch_sdk(_FakeResponse([_FakeNonTextBlock()])): - result = asyncio.run(_call_anthropic("claude-haiku-4-5", "prompt", 100)) + result = await _call_anthropic("claude-haiku-4-5", "prompt", 100) assert result == "" - def test_non_text_block_before_text_block(self) -> None: + async def test_non_text_block_before_text_block(self) -> None: response = _FakeResponse([_FakeNonTextBlock(), _FakeTextBlock("the verdict")]) with self._patch_sdk(response): - result = asyncio.run(_call_anthropic("claude-haiku-4-5", "prompt", 100)) + result = await _call_anthropic("claude-haiku-4-5", "prompt", 100) assert result == "the verdict" - def test_text_block_returns_text(self) -> None: + async def test_text_block_returns_text(self) -> None: with self._patch_sdk(_FakeResponse([_FakeTextBlock("hello")])): - result = asyncio.run(_call_anthropic("claude-haiku-4-5", "prompt", 100)) + result = await _call_anthropic("claude-haiku-4-5", "prompt", 100) assert result == "hello" @@ -196,21 +195,21 @@ def _patch_sdk(self, response: _FakeGoogleResponse) -> AbstractContextManager[No {"google": fake_google, "google.genai": fake_genai}, ) - def test_none_text_returns_empty_string(self) -> None: + async def test_none_text_returns_empty_string(self) -> None: """``response.text`` is None (e.g. safety-filtered) -> "" not None.""" with ( patch.dict("os.environ", {"GOOGLE_API_KEY": "test-key"}), self._patch_sdk(_FakeGoogleResponse(None)), ): - result = asyncio.run(_call_google("gemini-2.0-flash", "prompt")) + result = await _call_google("gemini-2.0-flash", "prompt") assert result == "" - def test_text_returns_text(self) -> None: + async def test_text_returns_text(self) -> None: with ( patch.dict("os.environ", {"GOOGLE_API_KEY": "test-key"}), self._patch_sdk(_FakeGoogleResponse("the verdict")), ): - result = asyncio.run(_call_google("gemini-2.0-flash", "prompt")) + result = await _call_google("gemini-2.0-flash", "prompt") assert result == "the verdict" @@ -220,21 +219,21 @@ def test_text_returns_text(self) -> None: class TestLLMJudgeLegacy: - def test_reads_score_file(self, tmp_path: Path) -> None: + async def test_reads_score_file(self, tmp_path: Path) -> None: (tmp_path / "llm_judge_score.txt").write_text("0.85\n") func = LLMJudgeRewardFunc(prompt="rate it") - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == pytest.approx(0.85) - def test_missing_score_file(self, tmp_path: Path) -> None: + async def test_missing_score_file(self, tmp_path: Path) -> None: func = LLMJudgeRewardFunc(prompt="rate it") - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == 0.0 - def test_invalid_score_file(self, tmp_path: Path) -> None: + async def test_invalid_score_file(self, tmp_path: Path) -> None: (tmp_path / "llm_judge_score.txt").write_text("not-a-number\n") func = LLMJudgeRewardFunc(prompt="rate it") - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == 0.0 @@ -268,7 +267,7 @@ def _make_rubric_toml(tmp_path: Path, content: str) -> Path: class TestLLMJudgeRubricMode: @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_single_binary_criterion_pass( + async def test_single_binary_criterion_pass( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: mock_judge.return_value = _MOCK_PASS_RESPONSE @@ -287,7 +286,7 @@ def test_single_binary_criterion_pass( ) func = LLMJudgeRewardFunc(rubric_path=rubric_path) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == pytest.approx(1.0) assert len(func.events) == 1 @@ -296,7 +295,7 @@ def test_single_binary_criterion_pass( mock_judge.assert_called_once() @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_single_binary_criterion_fail( + async def test_single_binary_criterion_fail( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: mock_judge.return_value = _MOCK_FAIL_RESPONSE @@ -311,13 +310,13 @@ def test_single_binary_criterion_fail( ) func = LLMJudgeRewardFunc(rubric_path=rubric_path) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == pytest.approx(0.0) assert func.events[0].reward == 0.0 @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_multiple_criteria_weighted_mean( + async def test_multiple_criteria_weighted_mean( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: mock_judge.side_effect = [_MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE] @@ -337,14 +336,16 @@ def test_multiple_criteria_weighted_mean( ) func = LLMJudgeRewardFunc(rubric_path=rubric_path) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) # weighted_mean: (1.0 * 0.7 + 0.0 * 0.3) / (0.7 + 0.3) assert score == pytest.approx(0.7) assert len(func.events) == 2 @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_all_pass_aggregation(self, mock_judge: AsyncMock, tmp_path: Path) -> None: + async def test_all_pass_aggregation( + self, mock_judge: AsyncMock, tmp_path: Path + ) -> None: mock_judge.side_effect = [_MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE] (tmp_path / "output.txt").write_text("output") @@ -363,11 +364,13 @@ def test_all_pass_aggregation(self, mock_judge: AsyncMock, tmp_path: Path) -> No ) func = LLMJudgeRewardFunc(rubric_path=rubric_path) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == 0.0 # Not all passed @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_any_pass_aggregation(self, mock_judge: AsyncMock, tmp_path: Path) -> None: + async def test_any_pass_aggregation( + self, mock_judge: AsyncMock, tmp_path: Path + ) -> None: mock_judge.side_effect = [_MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE] (tmp_path / "output.txt").write_text("output") @@ -386,11 +389,13 @@ def test_any_pass_aggregation(self, mock_judge: AsyncMock, tmp_path: Path) -> No ) func = LLMJudgeRewardFunc(rubric_path=rubric_path) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == 1.0 # At least one passed @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_likert_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None: + async def test_likert_criterion( + self, mock_judge: AsyncMock, tmp_path: Path + ) -> None: mock_judge.return_value = '{"score": 4, "reasoning": "pretty good"}' (tmp_path / "output.txt").write_text("output") @@ -405,12 +410,14 @@ def test_likert_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None: ) func = LLMJudgeRewardFunc(rubric_path=rubric_path) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) # likert: (4 - 1) / (5 - 1) = 0.75 assert score == pytest.approx(0.75) @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_numeric_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None: + async def test_numeric_criterion( + self, mock_judge: AsyncMock, tmp_path: Path + ) -> None: mock_judge.return_value = '{"score": 70, "reasoning": "decent"}' (tmp_path / "output.txt").write_text("output") @@ -426,7 +433,7 @@ def test_numeric_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None: ) func = LLMJudgeRewardFunc(rubric_path=rubric_path) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == pytest.approx(0.7) @@ -437,7 +444,9 @@ def test_numeric_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None: class TestLLMJudgeInlineCriteria: @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_inline_criteria_list(self, mock_judge: AsyncMock, tmp_path: Path) -> None: + async def test_inline_criteria_list( + self, mock_judge: AsyncMock, tmp_path: Path + ) -> None: mock_judge.return_value = _MOCK_PASS_RESPONSE (tmp_path / "output.txt").write_text("answer") @@ -446,11 +455,11 @@ def test_inline_criteria_list(self, mock_judge: AsyncMock, tmp_path: Path) -> No {"description": "Is it correct?", "type": "binary"}, ], ) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == pytest.approx(1.0) @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_inline_with_harvey_lab_keys( + async def test_inline_with_harvey_lab_keys( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: """Inline criteria can use Harvey LAB style keys (id, match_criteria).""" @@ -466,7 +475,7 @@ def test_inline_with_harvey_lab_keys( }, ], ) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == pytest.approx(1.0) assert func.events[0].source == "criterion:criterion-1" @@ -478,7 +487,7 @@ def test_inline_with_harvey_lab_keys( class TestLLMJudgeAutoDiscovery: @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_auto_discovers_rubric_toml( + async def test_auto_discovers_rubric_toml( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: mock_judge.return_value = _MOCK_PASS_RESPONSE @@ -492,7 +501,7 @@ def test_auto_discovers_rubric_toml( ) func = LLMJudgeRewardFunc() - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == pytest.approx(1.0) @@ -503,7 +512,7 @@ def test_auto_discovers_rubric_toml( class TestLLMJudgeErrors: @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_judge_error_returns_zero( + async def test_judge_error_returns_zero( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: """A real API failure (bad key, model not found) is a genuine judge @@ -514,13 +523,13 @@ def test_judge_error_returns_zero( func = LLMJudgeRewardFunc( criteria=[{"description": "test"}], ) - score = asyncio.run(func.score(tmp_path)) + score = await func.score(tmp_path) assert score == 0.0 assert len(func.events) == 1 assert func.events[0].reward == 0.0 @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_missing_sdk_propagates_not_scored_zero( + async def test_missing_sdk_propagates_not_scored_zero( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: """A missing provider SDK (JudgeEnvironmentError) must propagate out of @@ -535,7 +544,7 @@ def test_missing_sdk_propagates_not_scored_zero( func = LLMJudgeRewardFunc(criteria=[{"description": "test"}]) with pytest.raises(JudgeEnvironmentError): - asyncio.run(func.score(tmp_path)) + await func.score(tmp_path) # --------------------------------------------------------------------------- @@ -545,7 +554,7 @@ def test_missing_sdk_propagates_not_scored_zero( class TestEvaluationDetails: @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_writes_evaluation_details( + async def test_writes_evaluation_details( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: mock_judge.side_effect = [_MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE] @@ -557,7 +566,7 @@ def test_writes_evaluation_details( {"description": "B", "id": "b"}, ], ) - asyncio.run(func.score(tmp_path)) + await func.score(tmp_path) details_path = tmp_path / "evaluation_details.json" assert details_path.exists() @@ -574,7 +583,9 @@ def test_writes_evaluation_details( class TestDenseRewardEvents: @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_per_criterion_events(self, mock_judge: AsyncMock, tmp_path: Path) -> None: + async def test_per_criterion_events( + self, mock_judge: AsyncMock, tmp_path: Path + ) -> None: mock_judge.side_effect = [ _MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE, @@ -589,7 +600,7 @@ def test_per_criterion_events(self, mock_judge: AsyncMock, tmp_path: Path) -> No {"description": "C", "id": "c"}, ], ) - asyncio.run(func.score(tmp_path)) + await func.score(tmp_path) events = func.events assert len(events) == 3 @@ -603,7 +614,7 @@ def test_per_criterion_events(self, mock_judge: AsyncMock, tmp_path: Path) -> No assert events[0].source == "criterion:a" @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock) - def test_events_cleared_between_calls( + async def test_events_cleared_between_calls( self, mock_judge: AsyncMock, tmp_path: Path ) -> None: mock_judge.return_value = _MOCK_PASS_RESPONSE @@ -613,10 +624,10 @@ def test_events_cleared_between_calls( criteria=[{"description": "A"}], ) - asyncio.run(func.score(tmp_path)) + await func.score(tmp_path) assert len(func.events) == 1 - asyncio.run(func.score(tmp_path)) + await func.score(tmp_path) assert len(func.events) == 1 # Not accumulated