From 8f2811ee1267b77658074703008219fe48c70e83 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 21 May 2026 18:14:17 -0400 Subject: [PATCH 1/2] fix(traces): keep parquet conversion inside HF download fallback scope Bug G from unaddressed PR #323 review-bot comments (Codex P1). _download_hf_dataset ran _parquet_to_jsonl outside the per-candidate try/except. If a parquet file downloaded but conversion failed (pyarrow missing or decode error), the exception propagated immediately and the JSONL candidates / API fallback were never tried. Move the conversion inside the try so a conversion failure falls through to the alternatives. --- src/benchflow/traces/huggingface.py | 8 +++-- tests/test_traces_huggingface.py | 54 +++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/src/benchflow/traces/huggingface.py b/src/benchflow/traces/huggingface.py index 8de0fed3..df0a6145 100644 --- a/src/benchflow/traces/huggingface.py +++ b/src/benchflow/traces/huggingface.py @@ -160,10 +160,14 @@ def _download_hf_dataset( filename=filename, repo_type="dataset", ) + # Convert parquet to JSONL. The conversion is kept inside the + # try/fallback scope: if a parquet file downloads but pyarrow + # is missing (or decoding fails), the failure must fall through + # to the JSONL candidates / `_download_via_api` rather than + # propagating immediately (regression guarded — PR #323). + _parquet_to_jsonl(Path(downloaded), out_path, max_rows=max_rows) except Exception: continue - # Convert parquet to JSONL - _parquet_to_jsonl(Path(downloaded), out_path, max_rows=max_rows) return out_path # Try a JSONL data file for the split diff --git a/tests/test_traces_huggingface.py b/tests/test_traces_huggingface.py index 614a7094..e3f5b540 100644 --- a/tests/test_traces_huggingface.py +++ b/tests/test_traces_huggingface.py @@ -2,7 +2,11 @@ from __future__ import annotations +from pathlib import Path + +import benchflow.traces.huggingface as hf from benchflow.traces.huggingface import ( + _download_hf_dataset, _pick_split_file, _split_filename_candidates, ) @@ -74,3 +78,53 @@ def test_split_filename_candidates_prefers_matched_file() -> None: matched = "data/test-00000-of-00002.parquet" candidates = _split_filename_candidates(matched, "test", ".parquet") assert candidates[0] == matched + + +def test_parquet_conversion_failure_falls_through_to_jsonl(monkeypatch, tmp_path): + """Guards bug G from PR #323: a parquet conversion failure must not abort. + + When a parquet file downloads but the conversion fails (e.g. ``pyarrow`` + missing or decode error), ``_download_hf_dataset`` must fall through to the + JSONL candidates instead of letting the exception propagate immediately — + a regression from the prior behavior that alternated formats. Before the + fix, ``_parquet_to_jsonl`` ran outside the per-candidate try/except. + """ + import sys + import types + + cache = tmp_path / "cache" + cache.mkdir() + + # A JSONL source the fallback path should successfully copy. + jsonl_src = tmp_path / "src.jsonl" + jsonl_src.write_text('{"ok": true}\n') + + def fake_list_repo_files(repo_id, repo_type=None): + return ["data/train-00000-of-00001.parquet", "data/train.jsonl"] + + def fake_hf_hub_download(repo_id, filename, repo_type=None): + # Parquet "downloads" fine; JSONL "downloads" fine too. + if filename.endswith(".parquet"): + p = tmp_path / "downloaded.parquet" + p.write_bytes(b"not-real-parquet") + return str(p) + return str(jsonl_src) + + # Inject a stub `huggingface_hub` so `_download_hf_dataset` takes the + # hub branch even when the real package is not installed. + fake_hub = types.ModuleType("huggingface_hub") + fake_hub.list_repo_files = fake_list_repo_files + fake_hub.hf_hub_download = fake_hf_hub_download + monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub) + + # Parquet conversion fails — exactly the pyarrow-missing / decode-error case. + def boom(parquet_path, out_path, *, max_rows=None): + raise ImportError("pyarrow is required for parquet datasets.") + + monkeypatch.setattr(hf, "_parquet_to_jsonl", boom) + + out = _download_hf_dataset("some/repo", split="train", cache=cache) + + # The JSONL fallback ran; the exception did not propagate. + assert Path(out).exists() + assert Path(out).read_text() == '{"ok": true}\n' From 9308ccf774a3be939b3c6b5331028b34df5a4f08 Mon Sep 17 00:00:00 2001 From: Xiangyi Li Date: Thu, 21 May 2026 18:14:23 -0400 Subject: [PATCH 2/2] fix(sandbox): handle non-identifier env keys and scope umask in exec Bugs H and I from unaddressed PR #323 review-bot comments. Bug H (Codex P2): _wrap_command_with_env_file serialized env as shell 'export KEY=...' lines and sourced them. Keys that are valid process env names but not valid POSIX shell identifiers (containing '.' or '-') made '. {env_path}' return non-zero, so the user command never ran. Skip such keys with a warning; valid keys still flow through. Bug I (Cursor Medium): the 'umask 077' protecting the temp env file ran at the top of the chain and persisted into the user's command, giving files it created mode-0600 unexpectedly. Scope umask to a subshell around just the env-file write. --- src/benchflow/sandbox/docker.py | 43 +++++++++++++++++++++--- tests/test_sandbox.py | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 5 deletions(-) diff --git a/src/benchflow/sandbox/docker.py b/src/benchflow/sandbox/docker.py index f2797a1d..965e135c 100644 --- a/src/benchflow/sandbox/docker.py +++ b/src/benchflow/sandbox/docker.py @@ -492,8 +492,14 @@ async def exec( exec_command, check=False, timeout_sec=timeout_sec ) - @staticmethod - def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str: + # A POSIX shell identifier: a name the shell can `export`. Keys outside + # this grammar (e.g. containing `.` or `-`) are valid process env keys but + # cannot be assigned via `export NAME=...`; sourcing such a line aborts the + # whole `. {env_path}` step, so the user command would never run (PR #323). + _SHELL_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$") + + @classmethod + def _wrap_command_with_env_file(cls, env: dict[str, str], command: str) -> str: """Return *command* prefixed to materialize *env* from a file. The env vars are base64-encoded into the command string (not visible @@ -502,16 +508,43 @@ def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str: command runs. A ``trap ... EXIT`` deletes the temp file unconditionally — even if the decode/source step fails — so a failed ``&&`` chain can never leave the env file behind. + + Only keys that are valid POSIX shell identifiers are emitted as + ``export`` lines. Keys that are not (e.g. containing ``.`` or ``-``) + cannot be assigned by the shell — emitting them would make + ``. {env_path}`` fail and the user command would never run — so they + are skipped with a warning rather than silently breaking exec + (regression guarded — PR #323). + + The ``umask 077`` that protects the env file is scoped to a subshell + so it does not leak into the user's command — otherwise files the + command creates would get mode-0600 unexpectedly (PR #323). """ - env_body = "".join(f"export {k}={shlex.quote(v)}\n" for k, v in env.items()) + exportable: dict[str, str] = {} + skipped: list[str] = [] + for k, v in env.items(): + if cls._SHELL_IDENTIFIER_RE.match(k): + exportable[k] = v + else: + skipped.append(k) + if skipped: + logger.warning( + "Skipping env var(s) with non-identifier names (cannot be " + "exported by the shell): %s", + ", ".join(sorted(skipped)), + ) + + env_body = "".join( + f"export {k}={shlex.quote(v)}\n" for k, v in exportable.items() + ) encoded = base64.b64encode(env_body.encode()).decode() # Unique suffix so concurrent exec() calls in one container can't # clobber each other's env file. env_path = f"/tmp/.benchflow_exec_env_{uuid.uuid4().hex[:16]}" return ( f"trap 'rm -f {env_path}' EXIT && " - f"umask 077 && printf %s {shlex.quote(encoded)} | base64 -d > " - f"{env_path} && set -a && . {env_path} && set +a && " + f"(umask 077 && printf %s {shlex.quote(encoded)} | base64 -d > " + f"{env_path}) && set -a && . {env_path} && set +a && " f"{command}" ) diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py index 6d86e5ce..b7b29844 100644 --- a/tests/test_sandbox.py +++ b/tests/test_sandbox.py @@ -156,3 +156,62 @@ async def fake_run(command, check=True, timeout_sec=None): assert "-e" not in cmd for arg in cmd: assert "sk-leak" not in arg + + def test_umask_scoped_to_env_file_write(self): + """Guards bug I from PR #323: `umask 077` must not leak into the command. + + The restrictive umask that protects the temp env file is wrapped in a + subshell `(umask 077 && ...)`, so the user's command runs under the + default umask. Before the fix, `umask 077` ran at the top of the chain + and persisted, making files the command created mode-0600. + """ + from benchflow.sandbox.docker import DockerSandbox + + wrapped = DockerSandbox._wrap_command_with_env_file( + {"FOO": "bar"}, "the-user-command" + ) + # The umask is scoped to a `( ... )` subshell containing the env-file + # write — it is not a bare top-level statement that bleeds through. + assert "(umask 077 &&" in wrapped + # `umask 077` only ever appears immediately inside the `(` subshell. + assert wrapped.count("umask 077") == 1 + assert "(umask 077" in wrapped + # The subshell closes before `set -a`/source/user command run, so the + # umask does not affect anything after it. + post_subshell = wrapped.split(")", 1)[1] + assert "umask" not in post_subshell + assert post_subshell.lstrip().startswith("&& set -a") + assert wrapped.endswith("the-user-command") + + def test_non_identifier_env_keys_do_not_break_exec(self): + """Guards bug H from PR #323: non-identifier env keys must not abort. + + Env keys that are valid process env names but not valid shell + identifiers (e.g. containing `.` or `-`) cannot be assigned via + `export NAME=...`. Emitting them would make `. {env_path}` fail and the + user command would never run, so they are skipped — valid keys still + flow through and the wrapped command stays intact. + """ + import base64 + + from benchflow.sandbox.docker import DockerSandbox + + env = { + "VALID_KEY": "keep-me", + "dotted.key": "drop-me", + "dashed-key": "drop-me-too", + } + wrapped = DockerSandbox._wrap_command_with_env_file(env, "run-it") + + # Decode the base64 env body that gets sourced inside the container. + token = wrapped.split("printf %s ", 1)[1].split(" |", 1)[0] + encoded = token.strip("'") + body = base64.b64decode(encoded).decode() + + # The valid identifier is exported; non-identifier keys are skipped so + # sourcing the file cannot fail. + assert "export VALID_KEY=" in body + assert "dotted.key" not in body + assert "dashed-key" not in body + # The user command is still wrapped and reachable. + assert wrapped.endswith("run-it")