diff --git a/src/benchflow/task/verifier.py b/src/benchflow/task/verifier.py index 23e7f987..a9f4fd07 100644 --- a/src/benchflow/task/verifier.py +++ b/src/benchflow/task/verifier.py @@ -13,8 +13,10 @@ import json import logging +import re import shlex import shutil +from collections import deque from pathlib import Path from typing import Any @@ -60,6 +62,61 @@ class RubricNotFoundError(Exception): """Raised when an llm-judge verifier cannot locate its rubric file.""" +_TAIL_LINES = 30 + +# Secret patterns redacted from verifier stdout before it is surfaced in an +# exception message (and from there into ``verifier_error`` / summaries / +# dashboards). ``test-stdout.txt`` is untrusted subprocess output that can +# contain env dumps, install URLs, or tokens — see PR #572 review. Mirrors the +# trajectory redaction set (#537); kept local so this module has no dependency +# on the trajectories package. +_SECRET_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = ( + (re.compile(r"(sk-ant-[a-zA-Z0-9_-]{10})[a-zA-Z0-9_-]+"), r"\1***REDACTED***"), + (re.compile(r"(sk-proj-[a-zA-Z0-9_-]{10})[a-zA-Z0-9_-]+"), r"\1***REDACTED***"), + (re.compile(r"(sk-[a-zA-Z0-9]{10})[a-zA-Z0-9]+"), r"\1***REDACTED***"), + (re.compile(r"(AIzaSy[A-Za-z0-9_-]{4})[A-Za-z0-9_-]{20,}"), r"\1***REDACTED***"), + ( + re.compile(r"((?:AKIA|ASIA)[A-Z0-9]{4})[A-Z0-9]{12}(?![A-Z0-9])"), + r"\1***REDACTED***", + ), + (re.compile(r"(dtn_[A-Za-z0-9_]{4})[A-Za-z0-9_]{16,}"), r"\1***REDACTED***"), + ( + re.compile(r'("authorization"\s*:\s*"Bearer\s+)[^"]+(")', re.IGNORECASE), + r"\1***REDACTED***\2", + ), + ( + re.compile(r"(Bearer\s+)[A-Za-z0-9._-]{12,}", re.IGNORECASE), + r"\1***REDACTED***", + ), + ( + re.compile(r"((?:x-api-key|api-key)\s*[:=]\s*)\S+", re.IGNORECASE), + r"\1***REDACTED***", + ), +) + + +def _redact_secrets(text: str) -> str: + """Strip well-known credential patterns from untrusted subprocess output.""" + for pattern, replacement in _SECRET_PATTERNS: + text = pattern.sub(replacement, text) + return text + + +def _tail_file(path: Path, n: int = _TAIL_LINES) -> str: + """Return the last *n* lines of *path*, redacted, or "" if unreadable. + + Streams the file with a bounded ``deque`` so a large ``test-stdout.txt`` + is never fully materialized. Output is redacted because it is untrusted + subprocess output surfaced into ``verifier_error`` (PR #572 review). + """ + try: + with path.open(errors="replace") as f: + lines = deque(f, maxlen=n) + except OSError: + return "" + return _redact_secrets("".join(lines).rstrip("\n")) + + class Verifier: """Runs the task's verifier and parses rewards. @@ -289,16 +346,21 @@ async def _verify_test_script(self) -> VerifierResult: elif self._rollout_paths.reward_json_path.exists(): rewards = self._parse_reward_json() else: + stdout_tail = _tail_file(self._rollout_paths.test_stdout_path) if test_return_code != 0: - raise RewardFileNotFoundError( + msg = ( f"verifier exited with rc={test_return_code}; no reward file " f"found at {self._rollout_paths.reward_text_path} or " f"{self._rollout_paths.reward_json_path}" ) - raise RewardFileNotFoundError( - f"No reward file found at {self._rollout_paths.reward_text_path} or " - f"{self._rollout_paths.reward_json_path}" - ) + else: + msg = ( + f"No reward file found at {self._rollout_paths.reward_text_path} or " + f"{self._rollout_paths.reward_json_path}" + ) + if stdout_tail: + msg += f"\n--- test-stdout.txt (last {_TAIL_LINES} lines) ---\n{stdout_tail}" + raise RewardFileNotFoundError(msg) return VerifierResult(rewards=rewards) diff --git a/tests/test_verifier_output.py b/tests/test_verifier_output.py new file mode 100644 index 00000000..d429fe5c --- /dev/null +++ b/tests/test_verifier_output.py @@ -0,0 +1,163 @@ +"""Tests for verifier dependency-install classification and stdout surfacing. + +Covers the ENG-151 / PR #540 path where a missing reward file surfaces a +redacted tail of ``test-stdout.txt`` through ``RewardFileNotFoundError`` so +``classify_verifier_error`` can detect dependency-install failures, plus the +PR #572 review fix that redacts secrets from that untrusted subprocess output +before it lands in ``verifier_error`` / summaries / dashboards. +""" + +import pytest + +from benchflow._utils.scoring import ( + VERIFIER_DEP_INSTALL, + VERIFIER_FAILED, + classify_verifier_error, +) +from benchflow.task.verifier import _redact_secrets, _tail_file + +# --------------------------------------------------------------------------- +# dep_install classification (PR #540) — markers reach the classifier only +# because verifier.py tails test-stdout.txt into the exception message. +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "input_str,expected", + [ + ( + "verifier crashed: verifier exited with rc=1; no reward file found\n" + "--- test-stdout.txt (last 30 lines) ---\n" + "x No solution found when resolving tool dependencies: torch==2.1.2+cpu", + VERIFIER_DEP_INSTALL, + ), + ( + "verifier crashed: verifier exited with rc=1\n" + "Could not find a version that satisfies the requirement foo==9.9.9", + VERIFIER_DEP_INSTALL, + ), + ( + "verifier crashed: verifier exited with rc=1\nERROR: dependency install failed", + VERIFIER_DEP_INSTALL, + ), + ( + "verifier crashed: verifier exited with rc=1\nresolution impossible for bar", + VERIFIER_DEP_INSTALL, + ), + ], +) +def test_classify_dep_install_from_stdout_tail(input_str, expected): + """PR #540: dep-install markers in the surfaced stdout tail classify.""" + assert classify_verifier_error(input_str) == expected + + +def test_dep_install_takes_precedence_over_generic_crash(): + """PR #540: dep_install wins over verifier_failure when both markers present.""" + msg = ( + "verifier crashed: verifier exited with rc=1; no reward file found\n" + "--- test-stdout.txt (last 30 lines) ---\n" + "x No solution found when resolving tool dependencies: torch==2.1.2+cpu" + ) + assert classify_verifier_error(msg) == VERIFIER_DEP_INSTALL + + +# --------------------------------------------------------------------------- +# _tail_file: bounded streaming read + secret redaction (PR #572 review) +# --------------------------------------------------------------------------- + + +class TestTailFile: + def test_returns_redacted_tail(self, tmp_path): + """PR #540: tail surfaces the dep-install marker for the classifier.""" + stdout = tmp_path / "test-stdout.txt" + stdout.write_text( + "Installing dependencies...\n" + "x No solution found when resolving tool dependencies: torch==2.1.2+cpu\n" + ) + assert "no solution found" in _tail_file(stdout).lower() + + def test_missing_file_returns_empty(self, tmp_path): + assert _tail_file(tmp_path / "nope.txt") == "" + + def test_only_keeps_last_n_lines(self, tmp_path): + stdout = tmp_path / "test-stdout.txt" + stdout.write_text("".join(f"line{i}\n" for i in range(100))) + tail = _tail_file(stdout, n=5) + assert tail.splitlines() == [f"line{i}" for i in range(95, 100)] + + def test_redacts_secrets_in_tail(self, tmp_path): + """PR #572: untrusted stdout (env dumps, tokens) must be redacted. + + An obviously-fake key is used as the fixture (never a real one). + """ + stdout = tmp_path / "test-stdout.txt" + stdout.write_text( + "GEMINI_API_KEY=AIzaSyFAKEKEYFORTESTSONLYxxxxxxxxxxxxxxx\n" + "Authorization: Bearer fake-token-aaaaaaaaaaaaaaaa\n" + ) + tail = _tail_file(stdout) + assert "AIzaSyFAKEKEYFORTESTSONLYxxxxxxxxxxxxxxx" not in tail + assert "fake-token-aaaaaaaaaaaaaaaa" not in tail + assert "***REDACTED***" in tail + + +class TestRedactSecrets: + """PR #572: redaction patterns mirror the trajectory set (#537). + + All fixtures use obviously-fake credential values, never real ones. + """ + + @pytest.mark.parametrize( + "raw,leaked", + [ + ( + "key=AIzaSyFAKEKEYFORTESTSONLYxxxxxxxxxxxxxxx", + "FORTESTSONLYxxxxxxxxxxxxxxx", + ), + ("key=sk-ant-api03-FAKEFAKEFAKEdeadbeefcafe1234", "deadbeefcafe1234"), + ("key=AKIAFAKEFAKEFAKE1234", "FAKEFAKE1234"), + ("key=dtn_FAKEFAKEFAKEFAKEFAKE12345678", "FAKEFAKE12345678"), + ("x-api-key: abc123secretvalue456", "abc123secretvalue456"), + ], + ) + def test_redacts_pattern(self, raw, leaked): + out = _redact_secrets(raw) + assert leaked not in out + assert "***REDACTED***" in out + + @pytest.mark.parametrize( + "raw", + [ + "region=ASIAPACIFIC", # English word, not an AWS key + "queue=task-sk-us-east-1-foo-bar", # slug containing sk- + "label=dtn_v2_0", # short identifier + "just normal verifier output, nothing secret here", + ], + ) + def test_preserves_non_secret(self, raw): + assert _redact_secrets(raw) == raw + + def test_classifier_still_fires_after_redaction(self, tmp_path): + """Redaction must not eat the dep-install marker the classifier needs.""" + stdout = tmp_path / "test-stdout.txt" + stdout.write_text( + "GEMINI_API_KEY=AIzaSyFAKEKEYFORTESTSONLYxxxxxxxxxxxxxxx\n" + "x No solution found when resolving tool dependencies: torch==2.1.2+cpu\n" + ) + verifier_error = ( + "verifier crashed: verifier exited with rc=1\n" + f"--- test-stdout.txt (last 30 lines) ---\n{_tail_file(stdout)}" + ) + assert classify_verifier_error(verifier_error) == VERIFIER_DEP_INSTALL + assert "AIzaSyFAKEKEYFORTESTSONLYxxxxxxxxxxxxxxx" not in verifier_error + + +def test_no_dep_markers_stays_verifier_failure(tmp_path): + """PR #540: without dep-install markers the classifier returns verifier_failure.""" + stdout = tmp_path / "test-stdout.txt" + stdout.write_text("some random test output\nassertion failed\n") + verifier_error = ( + "verifier crashed: verifier exited with rc=1\n" + f"--- test-stdout.txt (last 30 lines) ---\n{_tail_file(stdout)}" + ) + assert classify_verifier_error(verifier_error) == VERIFIER_FAILED diff --git a/tests/test_verify.py b/tests/test_verify.py index d0bb1db1..3b15c6ad 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -34,6 +34,8 @@ ), ("verifier timed out after 900s", VERIFIER_TIMEOUT), ("verifier did something weird", "verifier_other"), + # dep_install classification + stdout surfacing live in + # tests/test_verifier_output.py (PR #540 / #572). ], ) def test_classify_verifier_error(input_str, expected):