Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 38 additions & 5 deletions src/benchflow/sandbox/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,8 +492,14 @@ async def exec(
exec_command, check=False, timeout_sec=timeout_sec
)

@staticmethod
def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str:
# A POSIX shell identifier: a name the shell can `export`. Keys outside
# this grammar (e.g. containing `.` or `-`) are valid process env keys but
# cannot be assigned via `export NAME=...`; sourcing such a line aborts the
# whole `. {env_path}` step, so the user command would never run (PR #323).
_SHELL_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")

@classmethod
def _wrap_command_with_env_file(cls, env: dict[str, str], command: str) -> str:
"""Return *command* prefixed to materialize *env* from a file.

The env vars are base64-encoded into the command string (not visible
Expand All @@ -502,16 +508,43 @@ def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str:
command runs. A ``trap ... EXIT`` deletes the temp file
unconditionally — even if the decode/source step fails — so a failed
``&&`` chain can never leave the env file behind.

Only keys that are valid POSIX shell identifiers are emitted as
``export`` lines. Keys that are not (e.g. containing ``.`` or ``-``)
cannot be assigned by the shell — emitting them would make
``. {env_path}`` fail and the user command would never run — so they
are skipped with a warning rather than silently breaking exec
(regression guarded — PR #323).

The ``umask 077`` that protects the env file is scoped to a subshell
so it does not leak into the user's command — otherwise files the
command creates would get mode-0600 unexpectedly (PR #323).
"""
env_body = "".join(f"export {k}={shlex.quote(v)}\n" for k, v in env.items())
exportable: dict[str, str] = {}
skipped: list[str] = []
for k, v in env.items():
if cls._SHELL_IDENTIFIER_RE.match(k):
exportable[k] = v
else:
skipped.append(k)
if skipped:
logger.warning(
"Skipping env var(s) with non-identifier names (cannot be "
"exported by the shell): %s",
", ".join(sorted(skipped)),
)

env_body = "".join(
f"export {k}={shlex.quote(v)}\n" for k, v in exportable.items()
)
encoded = base64.b64encode(env_body.encode()).decode()
# Unique suffix so concurrent exec() calls in one container can't
# clobber each other's env file.
env_path = f"/tmp/.benchflow_exec_env_{uuid.uuid4().hex[:16]}"
return (
f"trap 'rm -f {env_path}' EXIT && "
f"umask 077 && printf %s {shlex.quote(encoded)} | base64 -d > "
f"{env_path} && set -a && . {env_path} && set +a && "
f"(umask 077 && printf %s {shlex.quote(encoded)} | base64 -d > "
f"{env_path}) && set -a && . {env_path} && set +a && "
f"{command}"
)

Expand Down
8 changes: 6 additions & 2 deletions src/benchflow/traces/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,14 @@ def _download_hf_dataset(
filename=filename,
repo_type="dataset",
)
# Convert parquet to JSONL. The conversion is kept inside the
# try/fallback scope: if a parquet file downloads but pyarrow
# is missing (or decoding fails), the failure must fall through
# to the JSONL candidates / `_download_via_api` rather than
# propagating immediately (regression guarded — PR #323).
_parquet_to_jsonl(Path(downloaded), out_path, max_rows=max_rows)
except Exception:
continue
# Convert parquet to JSONL
_parquet_to_jsonl(Path(downloaded), out_path, max_rows=max_rows)
return out_path

# Try a JSONL data file for the split
Expand Down
59 changes: 59 additions & 0 deletions tests/test_sandbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,3 +156,62 @@ async def fake_run(command, check=True, timeout_sec=None):
assert "-e" not in cmd
for arg in cmd:
assert "sk-leak" not in arg

def test_umask_scoped_to_env_file_write(self):
"""Guards bug I from PR #323: `umask 077` must not leak into the command.

The restrictive umask that protects the temp env file is wrapped in a
subshell `(umask 077 && ...)`, so the user's command runs under the
default umask. Before the fix, `umask 077` ran at the top of the chain
and persisted, making files the command created mode-0600.
"""
from benchflow.sandbox.docker import DockerSandbox

wrapped = DockerSandbox._wrap_command_with_env_file(
{"FOO": "bar"}, "the-user-command"
)
# The umask is scoped to a `( ... )` subshell containing the env-file
# write — it is not a bare top-level statement that bleeds through.
assert "(umask 077 &&" in wrapped
# `umask 077` only ever appears immediately inside the `(` subshell.
assert wrapped.count("umask 077") == 1
assert "(umask 077" in wrapped
# The subshell closes before `set -a`/source/user command run, so the
# umask does not affect anything after it.
post_subshell = wrapped.split(")", 1)[1]
assert "umask" not in post_subshell
assert post_subshell.lstrip().startswith("&& set -a")
assert wrapped.endswith("the-user-command")

def test_non_identifier_env_keys_do_not_break_exec(self):
"""Guards bug H from PR #323: non-identifier env keys must not abort.

Env keys that are valid process env names but not valid shell
identifiers (e.g. containing `.` or `-`) cannot be assigned via
`export NAME=...`. Emitting them would make `. {env_path}` fail and the
user command would never run, so they are skipped — valid keys still
flow through and the wrapped command stays intact.
"""
import base64

from benchflow.sandbox.docker import DockerSandbox

env = {
"VALID_KEY": "keep-me",
"dotted.key": "drop-me",
"dashed-key": "drop-me-too",
}
wrapped = DockerSandbox._wrap_command_with_env_file(env, "run-it")

# Decode the base64 env body that gets sourced inside the container.
token = wrapped.split("printf %s ", 1)[1].split(" |", 1)[0]
encoded = token.strip("'")
body = base64.b64decode(encoded).decode()

# The valid identifier is exported; non-identifier keys are skipped so
# sourcing the file cannot fail.
assert "export VALID_KEY=" in body
assert "dotted.key" not in body
assert "dashed-key" not in body
# The user command is still wrapped and reachable.
assert wrapped.endswith("run-it")
54 changes: 54 additions & 0 deletions tests/test_traces_huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

from __future__ import annotations

from pathlib import Path

import benchflow.traces.huggingface as hf
from benchflow.traces.huggingface import (
_download_hf_dataset,
_pick_split_file,
_split_filename_candidates,
)
Expand Down Expand Up @@ -74,3 +78,53 @@ def test_split_filename_candidates_prefers_matched_file() -> None:
matched = "data/test-00000-of-00002.parquet"
candidates = _split_filename_candidates(matched, "test", ".parquet")
assert candidates[0] == matched


def test_parquet_conversion_failure_falls_through_to_jsonl(monkeypatch, tmp_path):
"""Guards bug G from PR #323: a parquet conversion failure must not abort.

When a parquet file downloads but the conversion fails (e.g. ``pyarrow``
missing or decode error), ``_download_hf_dataset`` must fall through to the
JSONL candidates instead of letting the exception propagate immediately —
a regression from the prior behavior that alternated formats. Before the
fix, ``_parquet_to_jsonl`` ran outside the per-candidate try/except.
"""
import sys
import types

cache = tmp_path / "cache"
cache.mkdir()

# A JSONL source the fallback path should successfully copy.
jsonl_src = tmp_path / "src.jsonl"
jsonl_src.write_text('{"ok": true}\n')

def fake_list_repo_files(repo_id, repo_type=None):
return ["data/train-00000-of-00001.parquet", "data/train.jsonl"]

def fake_hf_hub_download(repo_id, filename, repo_type=None):
# Parquet "downloads" fine; JSONL "downloads" fine too.
if filename.endswith(".parquet"):
p = tmp_path / "downloaded.parquet"
p.write_bytes(b"not-real-parquet")
return str(p)
return str(jsonl_src)

# Inject a stub `huggingface_hub` so `_download_hf_dataset` takes the
# hub branch even when the real package is not installed.
fake_hub = types.ModuleType("huggingface_hub")
fake_hub.list_repo_files = fake_list_repo_files
fake_hub.hf_hub_download = fake_hf_hub_download
monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)

# Parquet conversion fails — exactly the pyarrow-missing / decode-error case.
def boom(parquet_path, out_path, *, max_rows=None):
raise ImportError("pyarrow is required for parquet datasets.")

monkeypatch.setattr(hf, "_parquet_to_jsonl", boom)

out = _download_hf_dataset("some/repo", split="train", cache=cache)

# The JSONL fallback ran; the exception did not propagate.
assert Path(out).exists()
assert Path(out).read_text() == '{"ok": true}\n'
Loading