From 709dd701473bb72c3bc13818c4694e65392b811a Mon Sep 17 00:00:00 2001
From: Xiangyi Li <xiangyi@benchflow.ai>
Date: Thu, 21 May 2026 17:50:49 -0400
Subject: [PATCH 1/5] Extract shared result-bucket classifier

The passed/failed/errored/verifier_errored bucketing was duplicated with
textually-divergent predicates in Evaluation.run() and TaskMetrics. Extract
a single classify_result/classify_result_dict in _utils/scoring.py so both
sites consume one definition and the
passed+failed+errored+verifier_errored == total invariant holds
structurally. Rewrite the tautological test_error_and_verifier_error_counted_once
to call the real classifier instead of re-implementing the logic inline.

From PR #320 verification.
---
 src/benchflow/_utils/scoring.py | 50 +++++++++++++++++++++++++++++++++
 src/benchflow/evaluation.py     | 31 ++++++--------------
 src/benchflow/metrics.py        | 30 ++++++++++++--------
 tests/test_job.py               | 36 ++++++++----------------
 4 files changed, 89 insertions(+), 58 deletions(-)

diff --git a/src/benchflow/_utils/scoring.py b/src/benchflow/_utils/scoring.py
index 48b09ab7..7b3045bb 100644
--- a/src/benchflow/_utils/scoring.py
+++ b/src/benchflow/_utils/scoring.py
@@ -45,6 +45,56 @@ def classify_verifier_error(verifier_error: str | None) -> str | None:
     return "verifier_other"
 
 
+def classify_result(
+    *,
+    reward: float | None,
+    error: str | None,
+    verifier_error: str | None,
+) -> str:
+    """Classify a single result into exactly one terminal bucket.
+
+    Returns one of ``"passed"``, ``"failed"``, ``"errored"``,
+    ``"verifier_errored"``. The buckets are disjoint and exhaustive, so
+    ``passed + failed + errored + verifier_errored == total`` holds
+    structurally for any set of results.
+
+    Precedence:
+
+    1. A result with a reward is ``"passed"`` (reward == 1.0) or
+       ``"failed"`` (any other reward) — an explicit reward always wins,
+       even when an ``error`` string is also present (e.g. a warning).
+    2. With no reward, an agent ``error`` makes it ``"errored"``.
+    3. With no reward and no agent error, a ``verifier_error`` makes it
+       ``"verifier_errored"``. ``errored`` therefore takes precedence over
+       ``verifier_errored`` when both errors are present.
+    4. With no reward and no error of either kind, the result is
+       ``"errored"`` — a terminal result must land in some bucket, and an
+       absent reward with no recorded cause is still a failure to produce
+       a verdict.
+    """
+    if reward is not None:
+        return "passed" if reward == 1.0 else "failed"
+    if error:
+        return "errored"
+    if verifier_error:
+        return "verifier_errored"
+    return "errored"
+
+
+def classify_result_dict(result: dict) -> str:
+    """Classify a result dict (as persisted to ``result.json``) into a bucket.
+
+    Thin wrapper over :func:`classify_result` for the dict-shaped results
+    used by ``Evaluation.run()``. See :func:`classify_result` for the
+    bucket precedence rules.
+    """
+    return classify_result(
+        reward=extract_reward(result),
+        error=result.get("error"),
+        verifier_error=result.get("verifier_error"),
+    )
+
+
 def pass_rate(*, passed: int, total: int) -> float:
     """Pass rate over all tasks."""
     return passed / total if total > 0 else 0.0
diff --git a/src/benchflow/evaluation.py b/src/benchflow/evaluation.py
index f60c905d..338d189e 100644
--- a/src/benchflow/evaluation.py
+++ b/src/benchflow/evaluation.py
@@ -29,7 +29,7 @@
     INSTALL_FAILED,
     PIPE_CLOSED,
     classify_error,
-    extract_reward,
+    classify_result_dict,
     pass_rate,
     pass_rate_excl_errors,
 )
@@ -662,31 +662,18 @@ async def bounded(td: Path) -> tuple[str, RunResult]:
                 "agent_result": _agent_result_from_rollout(result),
             }
 
-        # Count — all values are dicts now, no type branching needed
+        # Count — every result lands in exactly one bucket via the shared
+        # classifier, so passed+failed+errored+verifier_errored == total
+        # holds structurally (see classify_result in _utils.scoring).
+        buckets = [classify_result_dict(r) for r in all_results.values()]
         job_result = EvaluationResult(
             job_name=self._job_name,
             config=cfg,
             total=len(task_dirs),
-            passed=sum(1 for r in all_results.values() if extract_reward(r) == 1.0),
-            failed=sum(
-                1
-                for r in all_results.values()
-                if (rw := extract_reward(r)) is not None and rw != 1.0
-            ),
-            errored=sum(
-                1
-                for r in all_results.values()
-                if r.get("error") and r.get("rewards") is None
-            ),
-            # Disjoint from `errored`: a result with both `error` and
-            # `verifier_error` is counted only as `errored`, so the
-            # passed+failed+errored+verifier_errored == total invariant holds.
-            verifier_errored=sum(
-                1
-                for r in all_results.values()
-                if r.get("verifier_error")
-                and not (r.get("error") and r.get("rewards") is None)
-            ),
+            passed=buckets.count("passed"),
+            failed=buckets.count("failed"),
+            errored=buckets.count("errored"),
+            verifier_errored=buckets.count("verifier_errored"),
             elapsed_sec=elapsed,
         )
 
diff --git a/src/benchflow/metrics.py b/src/benchflow/metrics.py
index 7606cd8f..a3b36f9d 100644
--- a/src/benchflow/metrics.py
+++ b/src/benchflow/metrics.py
@@ -13,6 +13,7 @@
 
 from benchflow._utils.scoring import (
     classify_error,
+    classify_result,
     classify_verifier_error,
     pass_rate,
     pass_rate_excl_errors,
@@ -41,17 +42,30 @@ class TaskMetrics:
     cost_usd: float | None = None
     usage_source: str = "unavailable"
 
+    @property
+    def _bucket(self) -> str:
+        """Terminal bucket via the shared classifier (see _utils.scoring).
+
+        Guarantees passed/failed/errored/verifier_errored are disjoint and
+        exhaustive — the same classification used by ``Evaluation.run()``.
+        """
+        return classify_result(
+            reward=self.reward,
+            error=self.error,
+            verifier_error=self.verifier_error,
+        )
+
     @property
     def passed(self) -> bool:
-        return self.reward == 1.0
+        return self._bucket == "passed"
 
     @property
     def failed(self) -> bool:
-        return self.reward is not None and self.reward != 1.0
+        return self._bucket == "failed"
 
     @property
     def errored(self) -> bool:
-        return self.reward is None and self.error is not None
+        return self._bucket == "errored"
 
     @property
     def verifier_errored(self) -> bool:
@@ -60,18 +74,12 @@ def verifier_errored(self) -> bool:
         Disjoint from `errored`: a task with both `error` and `verifier_error`
         is classified as `errored` only, so the count buckets never overlap.
         """
-        return (
-            self.reward is None
-            and self.verifier_error is not None
-            and self.error is None
-        )
+        return self._bucket == "verifier_errored"
 
     @property
     def completed(self) -> bool:
         """True when task reached a terminal non-error reward state."""
-        return (
-            not self.errored and not self.verifier_errored and self.reward is not None
-        )
+        return self._bucket in ("passed", "failed")
 
 
 @dataclass
diff --git a/tests/test_job.py b/tests/test_job.py
index 932a5d11..4bcd847b 100644
--- a/tests/test_job.py
+++ b/tests/test_job.py
@@ -96,33 +96,19 @@ def test_error_and_verifier_error_counted_once(self):
         """A result with BOTH error and verifier_error (rewards=None) must be
         classified into exactly one bucket so the count invariant holds.
 
-        Mirrors the disjoint bucketing in Evaluation.run(): a result is
-        ``errored`` when it has an agent error, and ``verifier_errored`` only
-        when it has a verifier error AND no agent error.
+        Exercises the real shared classifier: a result with an agent error is
+        ``errored``, and ``verifier_errored`` only applies when there is a
+        verifier error AND no agent error — ``errored`` takes precedence.
         """
-        results = {
-            "a": {
-                "rewards": None,
-                "error": "agent crashed",
-                "verifier_error": "verifier also failed",
-            },
+        from benchflow._utils.scoring import classify_result_dict
+
+        result = {
+            "rewards": None,
+            "error": "agent crashed",
+            "verifier_error": "verifier also failed",
         }
-        errored = sum(
-            1 for r in results.values() if r.get("error") and r.get("rewards") is None
-        )
-        verifier_errored = sum(
-            1
-            for r in results.values()
-            if r.get("verifier_error")
-            and not (r.get("error") and r.get("rewards") is None)
-        )
-        # Exactly one bucket — never double-counted.
-        assert errored == 1
-        assert verifier_errored == 0
-        counts = self._count(results)
-        assert counts["passed"] + counts["failed"] + errored + verifier_errored == len(
-            results
-        )
+        # The real classifier puts a both-errors result in exactly `errored`.
+        assert classify_result_dict(result) == "errored"
 
 
 class TestRunTaskLoop:

From 28c4b7db98f6f3a5bfe52b412fe602a0b6f12844 Mon Sep 17 00:00:00 2001
From: Xiangyi Li <xiangyi@benchflow.ai>
Date: Thu, 21 May 2026 17:50:54 -0400
Subject: [PATCH 2/5] Declare is_mounted on BaseSandbox; drop verifier getattr
 fallbacks

VerifierConfig.service is always a field and is_mounted is a property on
every concrete sandbox, so the defensive getattr() calls in verifier.py
are dead. Declare is_mounted on the BaseSandbox ABC (default False) and
use direct attribute access for both service and is_mounted.

Add a verifier test covering is_mounted=True + service="target": the
is_mounted fast path is gated on service == "main", and that guard was
previously untested because the test stub always had is_mounted=False.
The new test fails if the guard is removed.

From PR #321 verification.
---
 src/benchflow/sandbox/_base.py         | 12 +++++++++
 src/benchflow/task/verifier.py         |  4 +--
 tests/test_verifier_multi_container.py | 36 ++++++++++++++++++++++++--
 3 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/src/benchflow/sandbox/_base.py b/src/benchflow/sandbox/_base.py
index 535d2f09..5f939582 100644
--- a/src/benchflow/sandbox/_base.py
+++ b/src/benchflow/sandbox/_base.py
@@ -104,6 +104,18 @@ def __init__(
     def _uses_compose(self) -> bool:
         return False
 
+    @property
+    def is_mounted(self) -> bool:
+        """Whether the rollout dir is host-bind-mounted into the sandbox.
+
+        When ``True`` the agent container's verifier output is already visible
+        on the host, so the verifier can skip a ``download_dir`` round-trip.
+        Backends that run remotely (Daytona, Modal) have no bind mount and
+        override this to ``False``. The default is ``False`` so non-mounted
+        backends are the safe assumption.
+        """
+        return False
+
     def _maybe_resolve_task_env(self) -> None:
         if self.task_env_config.env and not self._uses_compose:
             resolved = resolve_env_vars(self.task_env_config.env)
diff --git a/src/benchflow/task/verifier.py b/src/benchflow/task/verifier.py
index df4b25da..3a8ade4d 100644
--- a/src/benchflow/task/verifier.py
+++ b/src/benchflow/task/verifier.py
@@ -147,7 +147,7 @@ async def _verify_test_script(self) -> VerifierResult:
         service so the verifier can inspect *target-side* state — RCE markers,
         DB modifications — instead of only the agent workspace (#248).
         """
-        service = getattr(self._task.config.verifier, "service", "main")
+        service = self._task.config.verifier.service
         try:
             await self._sandbox.upload_dir(
                 source_dir=self._task.paths.tests_dir,
@@ -202,7 +202,7 @@ async def _verify_test_script(self) -> VerifierResult:
         # Download verifier output if it is not host-mounted. Only the agent's
         # ``main`` container has the rollout dir bind-mounted; a target service
         # never does, so target-side rewards (#248) are always downloaded.
-        is_mounted = service == "main" and getattr(self._sandbox, "is_mounted", False)
+        is_mounted = service == "main" and self._sandbox.is_mounted
         if not is_mounted:
             try:
                 await self._sandbox.download_dir(
diff --git a/tests/test_verifier_multi_container.py b/tests/test_verifier_multi_container.py
index da9033ef..32d154eb 100644
--- a/tests/test_verifier_multi_container.py
+++ b/tests/test_verifier_multi_container.py
@@ -91,8 +91,13 @@ def _make_task(tmp_path: Path, toml: str) -> MagicMock:
 class _RecordingSandbox:
     """Sandbox stub that records the ``service`` used for each operation."""
 
-    def __init__(self, rollout_paths: RolloutPaths, reward: str = "1.0") -> None:
-        self.is_mounted = False
+    def __init__(
+        self,
+        rollout_paths: RolloutPaths,
+        reward: str = "1.0",
+        is_mounted: bool = False,
+    ) -> None:
+        self.is_mounted = is_mounted
         self._rollout_paths = rollout_paths
         self._reward = reward
         self.upload_calls: list[dict] = []
@@ -170,6 +175,33 @@ async def test_target_side_reward_downloaded_from_target_service(
         assert sandbox.download_calls, "reward must be downloaded from the target"
         assert {c["service"] for c in sandbox.download_calls} == {"target"}
 
+    @pytest.mark.asyncio
+    async def test_mounted_sandbox_still_downloads_from_target_service(
+        self, tmp_path: Path
+    ) -> None:
+        """#248: a host-mounted sandbox must still download target rewards.
+
+        The ``is_mounted`` fast path that skips ``download_dir`` is gated on
+        ``service == "main"`` — only the agent container has the rollout dir
+        bind-mounted. A non-``main`` target service is never mounted, so its
+        ``reward.txt`` must still be downloaded even when the sandbox is
+        otherwise mounted. This fails (RewardFileNotFoundError) if the
+        ``service == "main" and`` guard on the fast path is dropped.
+        """
+        toml = 'version = "1.0"\n[verifier]\nservice = "target"\n'
+        task = _make_task(tmp_path, toml)
+        rollout_paths = RolloutPaths(rollout_dir=tmp_path / "rollout")
+        rollout_paths.mkdir()
+        sandbox = _RecordingSandbox(rollout_paths, reward="1.0", is_mounted=True)
+
+        result = await Verifier(task, rollout_paths, sandbox).verify()
+
+        assert sandbox.download_calls, (
+            "target reward must be downloaded even when the sandbox is mounted"
+        )
+        assert {c["service"] for c in sandbox.download_calls} == {"target"}
+        assert result.rewards == {"reward": 1.0}
+
 
 # ---------------------------------------------------------------------------
 # Item 3 — cross-container hardening policy

From 01ecd15b57276f0aa22da898f64fc3667cd6bea2 Mon Sep 17 00:00:00 2001
From: Xiangyi Li <xiangyi@benchflow.ai>
Date: Thu, 21 May 2026 17:50:58 -0400
Subject: [PATCH 3/5] Document the acpx: runtime-key namespace contract

The acpx: key convention is load-bearing in two coupled sites: _acpx_wrap
mints the prefixed name and resolve_agent round-trips it. Make
resolve_agent_key the documented single owner of the namespace via an
ACPX_KEY_PREFIX constant and acpx_runtime_key() helper, with a
module-level contract comment so the coupling is explicit rather than
implied by an easily-missed comment. Behavior unchanged.

From PR #322 verification.
---
 src/benchflow/agents/registry.py | 53 +++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 7 deletions(-)

diff --git a/src/benchflow/agents/registry.py b/src/benchflow/agents/registry.py
index 026fac99..ac8e495e 100644
--- a/src/benchflow/agents/registry.py
+++ b/src/benchflow/agents/registry.py
@@ -628,6 +628,40 @@ def infer_env_key_for_model(model: str) -> str | None:
 
 VALID_PROTOCOLS = {"acp", "acpx"}
 
+# ---------------------------------------------------------------------------
+# The ``acpx:`` runtime-key namespace
+# ---------------------------------------------------------------------------
+#
+# An ``acpx/<agent>`` spec resolves to an acpx-wrapped AgentConfig whose
+# install/launch commands route through the acpx CLI. That wrapped config is
+# registered into ``AGENTS`` (and the installer/launch maps) under a stable
+# runtime key prefixed with ``ACPX_KEY_PREFIX`` so later *name-keyed* lookups
+# in the Rollout/Evaluation path pick up the wrapped commands.
+#
+# This namespace is owned end to end by ``resolve_agent_key``: it is the only
+# function that *mints* an ``acpx:`` key (by registering the wrapped config).
+# Two other sites must agree with that convention and are documented here so
+# the contract is explicit rather than implied:
+#
+#   - ``_acpx_wrap`` produces the wrapped config whose ``name`` carries the
+#     ``acpx:`` prefix (via ``acpx_runtime_key``).
+#   - ``resolve_agent`` round-trips an already-registered ``acpx:`` key:
+#     ``parse_agent_spec`` leaves it whole under the default ``acp`` protocol,
+#     and the ``protocol == "acp" and name in AGENTS`` branch returns it as-is.
+#
+# Changing the prefix or this round-trip behavior requires updating all three.
+ACPX_KEY_PREFIX = "acpx:"
+
+
+def acpx_runtime_key(canonical_name: str) -> str:
+    """Return the stable ``acpx:`` runtime key for a canonical agent name.
+
+    Single source of truth for the ``acpx:`` namespace — see the module-level
+    comment above. ``resolve_agent_key`` registers the wrapped config under
+    this key; ``resolve_agent`` round-trips it back to that config.
+    """
+    return f"{ACPX_KEY_PREFIX}{canonical_name}"
+
 
 def parse_agent_spec(spec: str) -> tuple[str, str]:
     """Parse an agent spec like 'acp/claude-agent-acp', 'acpx/claude', or 'claude'.
@@ -667,7 +701,8 @@ def _acpx_wrap(config: AgentConfig) -> AgentConfig:
             break
 
     return AgentConfig(
-        name=f"acpx:{config.name}",
+        # ``acpx:`` runtime key — see acpx_runtime_key / module-level contract.
+        name=acpx_runtime_key(config.name),
         install_cmd=f"{config.install_cmd} && {_ACPX_INSTALL}",
         launch_cmd=(
             f'export PATH="{_JS_AGENT_PATH}" && acpx {acpx_agent_name} --approve-all'
@@ -702,7 +737,8 @@ def resolve_agent(spec: str) -> AgentConfig:
         )
 
     # An already-resolved acpx runtime key (e.g. "acpx:claude-agent-acp")
-    # round-trips: parse_agent_spec leaves it whole and it lives in AGENTS.
+    # round-trips: parse_agent_spec leaves it whole under the default "acp"
+    # protocol and it lives in AGENTS. See the ACPX_KEY_PREFIX contract.
     if protocol == "acp" and name in AGENTS:
         return AGENTS[name]
 
@@ -725,12 +761,15 @@ def resolve_agent(spec: str) -> AgentConfig:
 def resolve_agent_key(spec: str) -> str:
     """Resolve an agent spec to a stable registry key.
 
-    For plain ACP agents this is the canonical agent name. For ``acpx/<agent>``
-    specs the acpx-wrapped config (acpx install/launch commands) is registered
-    into ``AGENTS``/``AGENT_INSTALLERS``/``AGENT_LAUNCH`` under a stable runtime
-    key (``acpx:<canonical>``) so that name-keyed lookups in the
+    This function owns the ``acpx:`` runtime-key namespace (see the
+    ``ACPX_KEY_PREFIX`` module-level contract). For plain ACP agents the key
+    is the canonical agent name. For ``acpx/<agent>`` specs the acpx-wrapped
+    config (acpx install/launch commands) is registered into
+    ``AGENTS``/``AGENT_INSTALLERS``/``AGENT_LAUNCH`` under the stable runtime
+    key ``acpx_runtime_key(<canonical>)`` so that name-keyed lookups in the
     Rollout/Evaluation path use the wrapped commands instead of the literal
-    spec string.
+    spec string. ``resolve_agent`` then round-trips that key back to the
+    wrapped config.
 
     Unknown agents are returned unchanged so callers can still surface their
     own diagnostics (raw-command fallback).

From 7f9a550d1369f6b8da79565bf6e32eb8674a33a3 Mon Sep 17 00:00:00 2001
From: Xiangyi Li <xiangyi@benchflow.ai>
Date: Thu, 21 May 2026 17:51:04 -0400
Subject: [PATCH 4/5] Tighten HF split-file match; make env-file cleanup
 unconditional

_pick_split_file matched basenames starting with "{split}-", which could
pick a sibling subset like test-small-00000-of-00001.parquet for
split="test". Anchor the sharded match on the HF {split}-NNNNN-of-NNNNN
convention so only the genuine shard resolves.

_wrap_command_with_env_file chained rm -f with &&, so a failed sourcing
step short-circuited the cleanup and leaked the temp env file. Use
trap 'rm -f ...' EXIT so the env file is removed unconditionally.

From PR #323 verification.
---
 src/benchflow/sandbox/docker.py     |  9 ++++++---
 src/benchflow/traces/huggingface.py | 18 +++++++++++++-----
 tests/test_sandbox.py               |  4 ++++
 tests/test_traces_huggingface.py    | 20 ++++++++++++++++++++
 4 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/src/benchflow/sandbox/docker.py b/src/benchflow/sandbox/docker.py
index cda68c98..f2797a1d 100644
--- a/src/benchflow/sandbox/docker.py
+++ b/src/benchflow/sandbox/docker.py
@@ -498,8 +498,10 @@ def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str:
 
         The env vars are base64-encoded into the command string (not visible
         as individual ``KEY=VALUE`` args in ``ps aux``), decoded to a
-        mode-0600 file inside the container, sourced, then deleted before the
-        real command runs.
+        mode-0600 file inside the container, and sourced before the real
+        command runs. A ``trap ... EXIT`` deletes the temp file
+        unconditionally — even if the decode/source step fails — so a failed
+        ``&&`` chain can never leave the env file behind.
         """
         env_body = "".join(f"export {k}={shlex.quote(v)}\n" for k, v in env.items())
         encoded = base64.b64encode(env_body.encode()).decode()
@@ -507,9 +509,10 @@ def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str:
         # clobber each other's env file.
         env_path = f"/tmp/.benchflow_exec_env_{uuid.uuid4().hex[:16]}"
         return (
+            f"trap 'rm -f {env_path}' EXIT && "
             f"umask 077 && printf %s {shlex.quote(encoded)} | base64 -d > "
             f"{env_path} && set -a && . {env_path} && set +a && "
-            f"rm -f {env_path} && {command}"
+            f"{command}"
         )
 
     async def attach(self) -> None:
diff --git a/src/benchflow/traces/huggingface.py b/src/benchflow/traces/huggingface.py
index 48a57703..8de0fed3 100644
--- a/src/benchflow/traces/huggingface.py
+++ b/src/benchflow/traces/huggingface.py
@@ -66,16 +66,24 @@ def _cache_dir() -> Path:
 def _pick_split_file(repo_files: list[str], split: str, suffix: str) -> str | None:
     """Pick the repo file matching *split* and *suffix*, if any.
 
-    Matches files under ``data/`` whose basename starts with the split name
-    (e.g. ``data/test-00000-of-00001.parquet`` or ``data/test.jsonl`` for
-    ``split="test"``). Returns ``None`` when no file matches so the caller
-    can fall back to constructed filename candidates.
+    Matches either the exact ``{split}{suffix}`` basename (e.g.
+    ``data/test.jsonl`` for ``split="test"``) or the HF sharded-parquet
+    convention ``{split}-NNNNN-of-NNNNN`` (e.g.
+    ``data/test-00000-of-00001.parquet``). The sharded match is anchored on
+    ``\\d+-of-\\d+`` so a sibling *subset* such as
+    ``test-small-00000-of-00001.parquet`` is not mistaken for ``split="test"``.
+    Returns ``None`` when no file matches so the caller can fall back to
+    constructed filename candidates.
     """
+    sharded_re = re.compile(rf"^{re.escape(split)}-\d+-of-\d+")
     candidates = [
         f
         for f in repo_files
         if f.endswith(suffix)
-        and (Path(f).name == f"{split}{suffix}" or Path(f).name.startswith(f"{split}-"))
+        and (
+            Path(f).name == f"{split}{suffix}"
+            or sharded_re.match(Path(f).name) is not None
+        )
     ]
     if not candidates:
         return None
diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py
index a2d10498..6d86e5ce 100644
--- a/tests/test_sandbox.py
+++ b/tests/test_sandbox.py
@@ -123,6 +123,10 @@ def test_wrap_command_does_not_inline_secret_values(self):
         assert wrapped.endswith("run-verifier")
         # Restrictive perms on the env file.
         assert "umask 077" in wrapped
+        # Cleanup is via `trap ... EXIT`, so the env file is removed even if
+        # the decode/source step fails and short-circuits the `&&` chain.
+        assert wrapped.startswith("trap 'rm -f ")
+        assert "EXIT" in wrapped
 
     @pytest.mark.asyncio
     async def test_exec_passes_no_dash_e_flags(self, monkeypatch):
diff --git a/tests/test_traces_huggingface.py b/tests/test_traces_huggingface.py
index 92565cd5..614a7094 100644
--- a/tests/test_traces_huggingface.py
+++ b/tests/test_traces_huggingface.py
@@ -37,6 +37,26 @@ def test_pick_split_file_returns_none_when_no_match() -> None:
     assert _pick_split_file(repo_files, "test", ".parquet") is None
 
 
+def test_pick_split_file_ignores_sibling_subset() -> None:
+    """A subset like `test-small-*` must not be picked for split="test".
+
+    The sharded match is anchored on the `{split}-NNNNN-of-NNNNN` convention,
+    so only the genuine `test` shard resolves — never a `test-small` subset.
+    """
+    repo_files = [
+        "data/test-small-00000-of-00001.parquet",
+        "data/test-00000-of-00001.parquet",
+    ]
+    picked = _pick_split_file(repo_files, "test", ".parquet")
+    assert picked == "data/test-00000-of-00001.parquet"
+
+
+def test_pick_split_file_no_match_when_only_subset_present() -> None:
+    """When only a `test-small` subset exists, split="test" finds nothing."""
+    repo_files = ["data/test-small-00000-of-00001.parquet"]
+    assert _pick_split_file(repo_files, "test", ".parquet") is None
+
+
 def test_split_filename_candidates_are_all_split_specific() -> None:
     """Constructed candidates for a non-train split never reference train."""
     candidates = _split_filename_candidates(None, "test", ".parquet")

From 38ded564124d719bf1bc2fdf419b2fc697305233 Mon Sep 17 00:00:00 2001
From: Xiangyi Li <xiangyi@benchflow.ai>
Date: Thu, 21 May 2026 17:51:09 -0400
Subject: [PATCH 5/5] Fix stale docstring; convert llm-judge tests to async
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test_api_failure_surfaces_original_error referenced last_error, which was
deleted in PR #324 — fix the prose.

Convert the ~18 sync tests still using asyncio.run() to async def +
await, removing the latent order-dependence class that PR #325 fixed
elsewhere (the repo runs asyncio_mode = "auto"). Mechanical conversion
only; assertions unchanged. Full suite verified in default, reverse, and
shuffled file orderings.

From PR #324 / #325 verification.
---
 tests/test_llm_judge.py | 129 ++++++++++++++++++++++------------------
 1 file changed, 70 insertions(+), 59 deletions(-)

diff --git a/tests/test_llm_judge.py b/tests/test_llm_judge.py
index a90e2909..297bd9f3 100644
--- a/tests/test_llm_judge.py
+++ b/tests/test_llm_judge.py
@@ -5,7 +5,6 @@
 
 from __future__ import annotations
 
-import asyncio
 import json
 from contextlib import AbstractContextManager
 from pathlib import Path
@@ -56,13 +55,13 @@ def test_unparseable_raises(self) -> None:
 
 
 class TestCallJudgeProviderFallback:
-    def test_api_failure_surfaces_original_error(self) -> None:
+    async def test_api_failure_surfaces_original_error(self) -> None:
         """A real API failure on the matching provider is raised as-is.
 
         The cross-provider fallback must NOT advance to a provider whose
-        API cannot serve this model name — otherwise ``last_error`` ends
-        up holding a misleading model-not-found error from the wrong
-        provider instead of the genuine failure.
+        API cannot serve this model name — otherwise the surfaced error is
+        a misleading model-not-found error from the wrong provider instead
+        of the genuine failure.
         """
         original = RuntimeError("anthropic: invalid x-api-key")
         anthropic_mock = AsyncMock(side_effect=original)
@@ -75,7 +74,7 @@ def test_api_failure_surfaces_original_error(self) -> None:
             patch("benchflow.rewards.llm._call_google", google_mock),
             pytest.raises(RuntimeError, match="invalid x-api-key") as exc_info,
         ):
-            asyncio.run(call_judge("claude-haiku-4-5", "prompt", retries=2))
+            await call_judge("claude-haiku-4-5", "prompt", retries=2)
 
         # The exact original exception object propagates.
         assert exc_info.value is original
@@ -84,7 +83,7 @@ def test_api_failure_surfaces_original_error(self) -> None:
         openai_mock.assert_not_awaited()
         google_mock.assert_not_awaited()
 
-    def test_import_error_falls_through_to_next_provider(self) -> None:
+    async def test_import_error_falls_through_to_next_provider(self) -> None:
         """A missing SDK (ImportError) still falls through to the next provider."""
         anthropic_mock = AsyncMock(side_effect=ImportError("no anthropic SDK"))
         openai_mock = AsyncMock(return_value="ok from openai")
@@ -93,13 +92,13 @@ def test_import_error_falls_through_to_next_provider(self) -> None:
             patch("benchflow.rewards.llm._call_anthropic", anthropic_mock),
             patch("benchflow.rewards.llm._call_openai", openai_mock),
         ):
-            result = asyncio.run(call_judge("claude-haiku-4-5", "prompt", retries=2))
+            result = await call_judge("claude-haiku-4-5", "prompt", retries=2)
 
         assert result == "ok from openai"
         # ImportError is not retried.
         assert anthropic_mock.await_count == 1
 
-    def test_all_sdks_missing_raises_judge_environment_error(self) -> None:
+    async def test_all_sdks_missing_raises_judge_environment_error(self) -> None:
         """When *every* provider SDK is missing, call_judge raises a
         JudgeEnvironmentError — a distinct, identifiable environment failure,
         not a generic RuntimeError that callers might mistake for a verdict."""
@@ -111,7 +110,7 @@ def test_all_sdks_missing_raises_judge_environment_error(self) -> None:
             patch("benchflow.rewards.llm._call_google", missing),
             pytest.raises(JudgeEnvironmentError, match="judge extra"),
         ):
-            asyncio.run(call_judge("claude-haiku-4-5", "prompt", retries=2))
+            await call_judge("claude-haiku-4-5", "prompt", retries=2)
 
     def test_judge_environment_error_is_a_runtime_error(self) -> None:
         """JudgeEnvironmentError stays a RuntimeError subclass so existing
@@ -149,25 +148,25 @@ def _patch_sdk(self, response: _FakeResponse) -> AbstractContextManager[None]:
         )
         return patch.dict("sys.modules", {"anthropic": fake_anthropic})
 
-    def test_empty_content_returns_empty_string(self) -> None:
+    async def test_empty_content_returns_empty_string(self) -> None:
         with self._patch_sdk(_FakeResponse([])):
-            result = asyncio.run(_call_anthropic("claude-haiku-4-5", "prompt", 100))
+            result = await _call_anthropic("claude-haiku-4-5", "prompt", 100)
         assert result == ""
 
-    def test_non_text_first_block_returns_empty_string(self) -> None:
+    async def test_non_text_first_block_returns_empty_string(self) -> None:
         with self._patch_sdk(_FakeResponse([_FakeNonTextBlock()])):
-            result = asyncio.run(_call_anthropic("claude-haiku-4-5", "prompt", 100))
+            result = await _call_anthropic("claude-haiku-4-5", "prompt", 100)
         assert result == ""
 
-    def test_non_text_block_before_text_block(self) -> None:
+    async def test_non_text_block_before_text_block(self) -> None:
         response = _FakeResponse([_FakeNonTextBlock(), _FakeTextBlock("the verdict")])
         with self._patch_sdk(response):
-            result = asyncio.run(_call_anthropic("claude-haiku-4-5", "prompt", 100))
+            result = await _call_anthropic("claude-haiku-4-5", "prompt", 100)
         assert result == "the verdict"
 
-    def test_text_block_returns_text(self) -> None:
+    async def test_text_block_returns_text(self) -> None:
         with self._patch_sdk(_FakeResponse([_FakeTextBlock("hello")])):
-            result = asyncio.run(_call_anthropic("claude-haiku-4-5", "prompt", 100))
+            result = await _call_anthropic("claude-haiku-4-5", "prompt", 100)
         assert result == "hello"
 
 
@@ -196,21 +195,21 @@ def _patch_sdk(self, response: _FakeGoogleResponse) -> AbstractContextManager[No
             {"google": fake_google, "google.genai": fake_genai},
         )
 
-    def test_none_text_returns_empty_string(self) -> None:
+    async def test_none_text_returns_empty_string(self) -> None:
         """``response.text`` is None (e.g. safety-filtered) -> "" not None."""
         with (
             patch.dict("os.environ", {"GOOGLE_API_KEY": "test-key"}),
             self._patch_sdk(_FakeGoogleResponse(None)),
         ):
-            result = asyncio.run(_call_google("gemini-2.0-flash", "prompt"))
+            result = await _call_google("gemini-2.0-flash", "prompt")
         assert result == ""
 
-    def test_text_returns_text(self) -> None:
+    async def test_text_returns_text(self) -> None:
         with (
             patch.dict("os.environ", {"GOOGLE_API_KEY": "test-key"}),
             self._patch_sdk(_FakeGoogleResponse("the verdict")),
         ):
-            result = asyncio.run(_call_google("gemini-2.0-flash", "prompt"))
+            result = await _call_google("gemini-2.0-flash", "prompt")
         assert result == "the verdict"
 
 
@@ -220,21 +219,21 @@ def test_text_returns_text(self) -> None:
 
 
 class TestLLMJudgeLegacy:
-    def test_reads_score_file(self, tmp_path: Path) -> None:
+    async def test_reads_score_file(self, tmp_path: Path) -> None:
         (tmp_path / "llm_judge_score.txt").write_text("0.85\n")
         func = LLMJudgeRewardFunc(prompt="rate it")
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == pytest.approx(0.85)
 
-    def test_missing_score_file(self, tmp_path: Path) -> None:
+    async def test_missing_score_file(self, tmp_path: Path) -> None:
         func = LLMJudgeRewardFunc(prompt="rate it")
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == 0.0
 
-    def test_invalid_score_file(self, tmp_path: Path) -> None:
+    async def test_invalid_score_file(self, tmp_path: Path) -> None:
         (tmp_path / "llm_judge_score.txt").write_text("not-a-number\n")
         func = LLMJudgeRewardFunc(prompt="rate it")
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == 0.0
 
 
@@ -268,7 +267,7 @@ def _make_rubric_toml(tmp_path: Path, content: str) -> Path:
 
 class TestLLMJudgeRubricMode:
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_single_binary_criterion_pass(
+    async def test_single_binary_criterion_pass(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         mock_judge.return_value = _MOCK_PASS_RESPONSE
@@ -287,7 +286,7 @@ def test_single_binary_criterion_pass(
         )
 
         func = LLMJudgeRewardFunc(rubric_path=rubric_path)
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
 
         assert score == pytest.approx(1.0)
         assert len(func.events) == 1
@@ -296,7 +295,7 @@ def test_single_binary_criterion_pass(
         mock_judge.assert_called_once()
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_single_binary_criterion_fail(
+    async def test_single_binary_criterion_fail(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         mock_judge.return_value = _MOCK_FAIL_RESPONSE
@@ -311,13 +310,13 @@ def test_single_binary_criterion_fail(
         )
 
         func = LLMJudgeRewardFunc(rubric_path=rubric_path)
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
 
         assert score == pytest.approx(0.0)
         assert func.events[0].reward == 0.0
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_multiple_criteria_weighted_mean(
+    async def test_multiple_criteria_weighted_mean(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         mock_judge.side_effect = [_MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE]
@@ -337,14 +336,16 @@ def test_multiple_criteria_weighted_mean(
         )
 
         func = LLMJudgeRewardFunc(rubric_path=rubric_path)
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
 
         # weighted_mean: (1.0 * 0.7 + 0.0 * 0.3) / (0.7 + 0.3)
         assert score == pytest.approx(0.7)
         assert len(func.events) == 2
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_all_pass_aggregation(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
+    async def test_all_pass_aggregation(
+        self, mock_judge: AsyncMock, tmp_path: Path
+    ) -> None:
         mock_judge.side_effect = [_MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE]
         (tmp_path / "output.txt").write_text("output")
 
@@ -363,11 +364,13 @@ def test_all_pass_aggregation(self, mock_judge: AsyncMock, tmp_path: Path) -> No
         )
 
         func = LLMJudgeRewardFunc(rubric_path=rubric_path)
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == 0.0  # Not all passed
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_any_pass_aggregation(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
+    async def test_any_pass_aggregation(
+        self, mock_judge: AsyncMock, tmp_path: Path
+    ) -> None:
         mock_judge.side_effect = [_MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE]
         (tmp_path / "output.txt").write_text("output")
 
@@ -386,11 +389,13 @@ def test_any_pass_aggregation(self, mock_judge: AsyncMock, tmp_path: Path) -> No
         )
 
         func = LLMJudgeRewardFunc(rubric_path=rubric_path)
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == 1.0  # At least one passed
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_likert_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
+    async def test_likert_criterion(
+        self, mock_judge: AsyncMock, tmp_path: Path
+    ) -> None:
         mock_judge.return_value = '{"score": 4, "reasoning": "pretty good"}'
         (tmp_path / "output.txt").write_text("output")
 
@@ -405,12 +410,14 @@ def test_likert_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
         )
 
         func = LLMJudgeRewardFunc(rubric_path=rubric_path)
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         # likert: (4 - 1) / (5 - 1) = 0.75
         assert score == pytest.approx(0.75)
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_numeric_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
+    async def test_numeric_criterion(
+        self, mock_judge: AsyncMock, tmp_path: Path
+    ) -> None:
         mock_judge.return_value = '{"score": 70, "reasoning": "decent"}'
         (tmp_path / "output.txt").write_text("output")
 
@@ -426,7 +433,7 @@ def test_numeric_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
         )
 
         func = LLMJudgeRewardFunc(rubric_path=rubric_path)
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == pytest.approx(0.7)
 
 
@@ -437,7 +444,9 @@ def test_numeric_criterion(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
 
 class TestLLMJudgeInlineCriteria:
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_inline_criteria_list(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
+    async def test_inline_criteria_list(
+        self, mock_judge: AsyncMock, tmp_path: Path
+    ) -> None:
         mock_judge.return_value = _MOCK_PASS_RESPONSE
         (tmp_path / "output.txt").write_text("answer")
 
@@ -446,11 +455,11 @@ def test_inline_criteria_list(self, mock_judge: AsyncMock, tmp_path: Path) -> No
                 {"description": "Is it correct?", "type": "binary"},
             ],
         )
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == pytest.approx(1.0)
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_inline_with_harvey_lab_keys(
+    async def test_inline_with_harvey_lab_keys(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         """Inline criteria can use Harvey LAB style keys (id, match_criteria)."""
@@ -466,7 +475,7 @@ def test_inline_with_harvey_lab_keys(
                 },
             ],
         )
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == pytest.approx(1.0)
         assert func.events[0].source == "criterion:criterion-1"
 
@@ -478,7 +487,7 @@ def test_inline_with_harvey_lab_keys(
 
 class TestLLMJudgeAutoDiscovery:
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_auto_discovers_rubric_toml(
+    async def test_auto_discovers_rubric_toml(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         mock_judge.return_value = _MOCK_PASS_RESPONSE
@@ -492,7 +501,7 @@ def test_auto_discovers_rubric_toml(
         )
 
         func = LLMJudgeRewardFunc()
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == pytest.approx(1.0)
 
 
@@ -503,7 +512,7 @@ def test_auto_discovers_rubric_toml(
 
 class TestLLMJudgeErrors:
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_judge_error_returns_zero(
+    async def test_judge_error_returns_zero(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         """A real API failure (bad key, model not found) is a genuine judge
@@ -514,13 +523,13 @@ def test_judge_error_returns_zero(
         func = LLMJudgeRewardFunc(
             criteria=[{"description": "test"}],
         )
-        score = asyncio.run(func.score(tmp_path))
+        score = await func.score(tmp_path)
         assert score == 0.0
         assert len(func.events) == 1
         assert func.events[0].reward == 0.0
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_missing_sdk_propagates_not_scored_zero(
+    async def test_missing_sdk_propagates_not_scored_zero(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         """A missing provider SDK (JudgeEnvironmentError) must propagate out of
@@ -535,7 +544,7 @@ def test_missing_sdk_propagates_not_scored_zero(
         func = LLMJudgeRewardFunc(criteria=[{"description": "test"}])
 
         with pytest.raises(JudgeEnvironmentError):
-            asyncio.run(func.score(tmp_path))
+            await func.score(tmp_path)
 
 
 # ---------------------------------------------------------------------------
@@ -545,7 +554,7 @@ def test_missing_sdk_propagates_not_scored_zero(
 
 class TestEvaluationDetails:
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_writes_evaluation_details(
+    async def test_writes_evaluation_details(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         mock_judge.side_effect = [_MOCK_PASS_RESPONSE, _MOCK_FAIL_RESPONSE]
@@ -557,7 +566,7 @@ def test_writes_evaluation_details(
                 {"description": "B", "id": "b"},
             ],
         )
-        asyncio.run(func.score(tmp_path))
+        await func.score(tmp_path)
 
         details_path = tmp_path / "evaluation_details.json"
         assert details_path.exists()
@@ -574,7 +583,9 @@ def test_writes_evaluation_details(
 
 class TestDenseRewardEvents:
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_per_criterion_events(self, mock_judge: AsyncMock, tmp_path: Path) -> None:
+    async def test_per_criterion_events(
+        self, mock_judge: AsyncMock, tmp_path: Path
+    ) -> None:
         mock_judge.side_effect = [
             _MOCK_PASS_RESPONSE,
             _MOCK_FAIL_RESPONSE,
@@ -589,7 +600,7 @@ def test_per_criterion_events(self, mock_judge: AsyncMock, tmp_path: Path) -> No
                 {"description": "C", "id": "c"},
             ],
         )
-        asyncio.run(func.score(tmp_path))
+        await func.score(tmp_path)
 
         events = func.events
         assert len(events) == 3
@@ -603,7 +614,7 @@ def test_per_criterion_events(self, mock_judge: AsyncMock, tmp_path: Path) -> No
         assert events[0].source == "criterion:a"
 
     @patch("benchflow.rewards.llm.call_judge", new_callable=AsyncMock)
-    def test_events_cleared_between_calls(
+    async def test_events_cleared_between_calls(
         self, mock_judge: AsyncMock, tmp_path: Path
     ) -> None:
         mock_judge.return_value = _MOCK_PASS_RESPONSE
@@ -613,10 +624,10 @@ def test_events_cleared_between_calls(
             criteria=[{"description": "A"}],
         )
 
-        asyncio.run(func.score(tmp_path))
+        await func.score(tmp_path)
         assert len(func.events) == 1
 
-        asyncio.run(func.score(tmp_path))
+        await func.score(tmp_path)
         assert len(func.events) == 1  # Not accumulated