Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 22 additions & 5 deletions src/benchflow/task/verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ class RubricNotFoundError(Exception):
"""Raised when an llm-judge verifier cannot locate its rubric file."""


_TAIL_LINES = 30


def _tail_file(path: Path, n: int = _TAIL_LINES) -> str:
    """Return the last *n* lines of *path*, or empty string if unreadable.

    Args:
        path: File to read. Undecodable bytes are replaced instead of raising.
        n: Maximum number of trailing lines to keep. A value of zero or less
            yields an empty string.

    Returns:
        The trailing lines joined with a newline (no trailing newline), or an
        empty string when the file cannot be read or *n* is not positive.
    """
    # Guard first: ``lines[-0:]`` is ``lines[0:]``, so n == 0 would otherwise
    # return the whole file, and a negative n would drop leading lines
    # instead of selecting a tail.
    if n <= 0:
        return ""
    try:
        lines = path.read_text(errors="replace").splitlines()
    except OSError:
        # Best-effort helper: a missing or unreadable file means "no tail".
        return ""
    return "\n".join(lines[-n:])


class Verifier:
"""Runs the task's verifier and parses rewards.

Expand Down Expand Up @@ -289,16 +301,21 @@ async def _verify_test_script(self) -> VerifierResult:
elif self._rollout_paths.reward_json_path.exists():
rewards = self._parse_reward_json()
else:
stdout_tail = _tail_file(self._rollout_paths.test_stdout_path)
if test_return_code != 0:
raise RewardFileNotFoundError(
msg = (
f"verifier exited with rc={test_return_code}; no reward file "
f"found at {self._rollout_paths.reward_text_path} or "
f"{self._rollout_paths.reward_json_path}"
)
raise RewardFileNotFoundError(
f"No reward file found at {self._rollout_paths.reward_text_path} or "
f"{self._rollout_paths.reward_json_path}"
)
else:
msg = (
f"No reward file found at {self._rollout_paths.reward_text_path} or "
f"{self._rollout_paths.reward_json_path}"
)
if stdout_tail:
msg += f"\n--- test-stdout.txt (last {_TAIL_LINES} lines) ---\n{stdout_tail}"
raise RewardFileNotFoundError(msg)

return VerifierResult(rewards=rewards)

Expand Down
89 changes: 89 additions & 0 deletions tests/test_verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import pytest

from benchflow._utils.scoring import (
VERIFIER_DEP_INSTALL,
VERIFIER_FAILED,
VERIFIER_INFRA,
VERIFIER_TIMEOUT,
Expand All @@ -34,6 +35,28 @@
),
("verifier timed out after 900s", VERIFIER_TIMEOUT),
("verifier did something weird", "verifier_other"),
# dep_install markers surfaced via test-stdout.txt tail
(
"verifier crashed: verifier exited with rc=1; no reward file found\n"
"--- test-stdout.txt (last 30 lines) ---\n"
"× No solution found when resolving tool dependencies: torch==2.1.2+cpu", # noqa: RUF001
VERIFIER_DEP_INSTALL,
),
(
"verifier crashed: verifier exited with rc=1\n"
"Could not find a version that satisfies the requirement foo==9.9.9",
VERIFIER_DEP_INSTALL,
),
(
"verifier crashed: verifier exited with rc=1\n"
"ERROR: dependency install failed",
VERIFIER_DEP_INSTALL,
),
(
"verifier crashed: verifier exited with rc=1\n"
"resolution impossible for package bar",
VERIFIER_DEP_INSTALL,
),
],
)
def test_classify_verifier_error(input_str, expected):
Expand All @@ -49,6 +72,72 @@ def test_classify_verifier_error_substring_order():
assert classify_verifier_error(msg) == VERIFIER_FAILED


def test_dep_install_takes_precedence_over_generic_crash():
    """A message holding both a crash marker and a dep-install marker is dep_install."""
    crash_with_resolver_output = (
        "verifier crashed: verifier exited with rc=1; no reward file found\n"
        "--- test-stdout.txt (last 30 lines) ---\n"
        "× No solution found when resolving tool dependencies: torch==2.1.2+cpu"  # noqa: RUF001
    )
    category = classify_verifier_error(crash_with_resolver_output)
    assert category == VERIFIER_DEP_INSTALL


class TestRewardFileNotFoundSurfacesStdout:
    """Check the stdout-tail helper and the classifier work together.

    The messages built here mirror a "no reward file" error followed by a
    test-stdout.txt tail section, so classify_verifier_error can be exercised
    on dep-install markers that only appear in the verifier's stdout.
    """

    def test_exception_includes_stdout_tail(self, tmp_path):
        from benchflow.task.verifier import _tail_file

        stdout_file = tmp_path / "test-stdout.txt"
        stdout_file.write_text(
            "Installing dependencies...\n"
            "× No solution found when resolving tool dependencies: torch==2.1.2+cpu\n"  # noqa: RUF001
        )
        # The resolver failure line must survive into the returned tail.
        assert "no solution found" in _tail_file(stdout_file).lower()

    def test_exception_message_triggers_classifier(self, tmp_path):
        """Build a crash message with a dep-install tail and expect
        VERIFIER_DEP_INSTALL from the classifier."""
        from benchflow.task.verifier import _tail_file

        stdout_file = tmp_path / "test-stdout.txt"
        stdout_file.write_text(
            "Collecting torch==2.1.2+cpu\n"
            "× No solution found when resolving tool dependencies: torch==2.1.2+cpu\n"  # noqa: RUF001
        )

        tail = _tail_file(stdout_file)
        message = (
            f"verifier crashed: verifier exited with rc=1; no reward file "
            f"found at {tmp_path}/reward.txt or {tmp_path}/reward.json"
            f"\n--- test-stdout.txt (last 30 lines) ---\n{tail}"
        )
        category = classify_verifier_error(message)
        assert category == VERIFIER_DEP_INSTALL

    def test_missing_stdout_produces_no_tail(self, tmp_path):
        from benchflow.task.verifier import _tail_file

        # A path that was never created yields an empty tail.
        absent = tmp_path / "nonexistent.txt"
        assert _tail_file(absent) == ""

    def test_no_dep_markers_stays_verifier_failure(self, tmp_path):
        """Stdout without dep-install markers keeps the VERIFIER_FAILED category."""
        from benchflow.task.verifier import _tail_file

        stdout_file = tmp_path / "test-stdout.txt"
        stdout_file.write_text("some random test output\nfail\n")

        tail = _tail_file(stdout_file)
        message = (
            f"verifier crashed: verifier exited with rc=1; no reward file "
            f"found at {tmp_path}/reward.txt\n"
            f"--- test-stdout.txt (last 30 lines) ---\n{tail}"
        )
        category = classify_verifier_error(message)
        assert category == VERIFIER_FAILED


# ---------------------------------------------------------------------------
# RunResult with verifier_error
# ---------------------------------------------------------------------------
Expand Down
Loading