From 8f2811ee1267b77658074703008219fe48c70e83 Mon Sep 17 00:00:00 2001
From: Xiangyi Li <xiangyi@benchflow.ai>
Date: Thu, 21 May 2026 18:14:17 -0400
Subject: [PATCH 1/2] fix(traces): keep parquet conversion inside HF download
 fallback scope

Bug G from unaddressed PR #323 review-bot comments (Codex P1).

_download_hf_dataset ran _parquet_to_jsonl outside the per-candidate
try/except. If a parquet file downloaded but conversion failed (pyarrow
missing or decode error), the exception propagated immediately and the
JSONL candidates / API fallback were never tried. Move the conversion
inside the try so a conversion failure falls through to the alternatives.
---
 src/benchflow/traces/huggingface.py |  8 +++--
 tests/test_traces_huggingface.py    | 54 +++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/src/benchflow/traces/huggingface.py b/src/benchflow/traces/huggingface.py
index 8de0fed3..df0a6145 100644
--- a/src/benchflow/traces/huggingface.py
+++ b/src/benchflow/traces/huggingface.py
@@ -160,10 +160,14 @@ def _download_hf_dataset(
                     filename=filename,
                     repo_type="dataset",
                 )
+                # Convert parquet to JSONL. The conversion is kept inside the
+                # try/fallback scope: if a parquet file downloads but pyarrow
+                # is missing (or decoding fails), the failure must fall through
+                # to the JSONL candidates / `_download_via_api` rather than
+                # propagating immediately (regression guarded — PR #323).
+                _parquet_to_jsonl(Path(downloaded), out_path, max_rows=max_rows)
             except Exception:
                 continue
-            # Convert parquet to JSONL
-            _parquet_to_jsonl(Path(downloaded), out_path, max_rows=max_rows)
             return out_path
 
         # Try a JSONL data file for the split
diff --git a/tests/test_traces_huggingface.py b/tests/test_traces_huggingface.py
index 614a7094..e3f5b540 100644
--- a/tests/test_traces_huggingface.py
+++ b/tests/test_traces_huggingface.py
@@ -2,7 +2,11 @@
 
 from __future__ import annotations
 
+from pathlib import Path
+
+import benchflow.traces.huggingface as hf
 from benchflow.traces.huggingface import (
+    _download_hf_dataset,
     _pick_split_file,
     _split_filename_candidates,
 )
@@ -74,3 +78,53 @@ def test_split_filename_candidates_prefers_matched_file() -> None:
     matched = "data/test-00000-of-00002.parquet"
     candidates = _split_filename_candidates(matched, "test", ".parquet")
     assert candidates[0] == matched
+
+
+def test_parquet_conversion_failure_falls_through_to_jsonl(monkeypatch, tmp_path):
+    """Guards bug G from PR #323: a parquet conversion failure must not abort.
+
+    When a parquet file downloads but the conversion fails (e.g. ``pyarrow``
+    missing or decode error), ``_download_hf_dataset`` must fall through to the
+    JSONL candidates instead of letting the exception propagate immediately —
+    a regression from the prior behavior that alternated formats. Before the
+    fix, ``_parquet_to_jsonl`` ran outside the per-candidate try/except.
+    """
+    import sys
+    import types
+
+    cache = tmp_path / "cache"
+    cache.mkdir()
+
+    # A JSONL source the fallback path should successfully copy.
+    jsonl_src = tmp_path / "src.jsonl"
+    jsonl_src.write_text('{"ok": true}\n')
+
+    def fake_list_repo_files(repo_id, repo_type=None):
+        return ["data/train-00000-of-00001.parquet", "data/train.jsonl"]
+
+    def fake_hf_hub_download(repo_id, filename, repo_type=None):
+        # Parquet "downloads" fine; JSONL "downloads" fine too.
+        if filename.endswith(".parquet"):
+            p = tmp_path / "downloaded.parquet"
+            p.write_bytes(b"not-real-parquet")
+            return str(p)
+        return str(jsonl_src)
+
+    # Inject a stub `huggingface_hub` so `_download_hf_dataset` takes the
+    # hub branch even when the real package is not installed.
+    fake_hub = types.ModuleType("huggingface_hub")
+    fake_hub.list_repo_files = fake_list_repo_files
+    fake_hub.hf_hub_download = fake_hf_hub_download
+    monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hub)
+
+    # Parquet conversion fails — exactly the pyarrow-missing / decode-error case.
+    def boom(parquet_path, out_path, *, max_rows=None):
+        raise ImportError("pyarrow is required for parquet datasets.")
+
+    monkeypatch.setattr(hf, "_parquet_to_jsonl", boom)
+
+    out = _download_hf_dataset("some/repo", split="train", cache=cache)
+
+    # The JSONL fallback ran; the exception did not propagate.
+    assert Path(out).exists()
+    assert Path(out).read_text() == '{"ok": true}\n'

From 9308ccf774a3be939b3c6b5331028b34df5a4f08 Mon Sep 17 00:00:00 2001
From: Xiangyi Li <xiangyi@benchflow.ai>
Date: Thu, 21 May 2026 18:14:23 -0400
Subject: [PATCH 2/2] fix(sandbox): handle non-identifier env keys and scope
 umask in exec

Bugs H and I from unaddressed PR #323 review-bot comments.

Bug H (Codex P2): _wrap_command_with_env_file serialized env as shell
'export KEY=...' lines and sourced them. Keys that are valid process env
names but not valid POSIX shell identifiers (containing '.' or '-') made
'. {env_path}' return non-zero, so the user command never ran. Skip such
keys with a warning; valid keys still flow through.

Bug I (Cursor Medium): the 'umask 077' protecting the temp env file ran
at the top of the chain and persisted into the user's command, giving
files it created mode-0600 unexpectedly. Scope umask to a subshell
around just the env-file write.
---
 src/benchflow/sandbox/docker.py | 43 +++++++++++++++++++++---
 tests/test_sandbox.py           | 59 +++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 5 deletions(-)

diff --git a/src/benchflow/sandbox/docker.py b/src/benchflow/sandbox/docker.py
index f2797a1d..965e135c 100644
--- a/src/benchflow/sandbox/docker.py
+++ b/src/benchflow/sandbox/docker.py
@@ -492,8 +492,14 @@ async def exec(
             exec_command, check=False, timeout_sec=timeout_sec
         )
 
-    @staticmethod
-    def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str:
+    # A POSIX shell identifier: a name the shell can `export`. Keys outside
+    # this grammar (e.g. containing `.` or `-`) are valid process env keys but
+    # cannot be assigned via `export NAME=...`; sourcing such a line aborts the
+    # whole `. {env_path}` step, so the user command would never run (PR #323).
+    _SHELL_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
+
+    @classmethod
+    def _wrap_command_with_env_file(cls, env: dict[str, str], command: str) -> str:
         """Return *command* prefixed to materialize *env* from a file.
 
         The env vars are base64-encoded into the command string (not visible
@@ -502,16 +508,43 @@ def _wrap_command_with_env_file(env: dict[str, str], command: str) -> str:
         command runs. A ``trap ... EXIT`` deletes the temp file
         unconditionally — even if the decode/source step fails — so a failed
         ``&&`` chain can never leave the env file behind.
+
+        Only keys that are valid POSIX shell identifiers are emitted as
+        ``export`` lines. Keys that are not (e.g. containing ``.`` or ``-``)
+        cannot be assigned by the shell — emitting them would make
+        ``. {env_path}`` fail and the user command would never run — so they
+        are skipped with a warning rather than silently breaking exec
+        (regression guarded — PR #323).
+
+        The ``umask 077`` that protects the env file is scoped to a subshell
+        so it does not leak into the user's command — otherwise files the
+        command creates would get mode-0600 unexpectedly (PR #323).
         """
-        env_body = "".join(f"export {k}={shlex.quote(v)}\n" for k, v in env.items())
+        exportable: dict[str, str] = {}
+        skipped: list[str] = []
+        for k, v in env.items():
+            if cls._SHELL_IDENTIFIER_RE.match(k):
+                exportable[k] = v
+            else:
+                skipped.append(k)
+        if skipped:
+            logger.warning(
+                "Skipping env var(s) with non-identifier names (cannot be "
+                "exported by the shell): %s",
+                ", ".join(sorted(skipped)),
+            )
+
+        env_body = "".join(
+            f"export {k}={shlex.quote(v)}\n" for k, v in exportable.items()
+        )
         encoded = base64.b64encode(env_body.encode()).decode()
         # Unique suffix so concurrent exec() calls in one container can't
         # clobber each other's env file.
         env_path = f"/tmp/.benchflow_exec_env_{uuid.uuid4().hex[:16]}"
         return (
             f"trap 'rm -f {env_path}' EXIT && "
-            f"umask 077 && printf %s {shlex.quote(encoded)} | base64 -d > "
-            f"{env_path} && set -a && . {env_path} && set +a && "
+            f"(umask 077 && printf %s {shlex.quote(encoded)} | base64 -d > "
+            f"{env_path}) && set -a && . {env_path} && set +a && "
             f"{command}"
         )
 
diff --git a/tests/test_sandbox.py b/tests/test_sandbox.py
index 6d86e5ce..b7b29844 100644
--- a/tests/test_sandbox.py
+++ b/tests/test_sandbox.py
@@ -156,3 +156,62 @@ async def fake_run(command, check=True, timeout_sec=None):
         assert "-e" not in cmd
         for arg in cmd:
             assert "sk-leak" not in arg
+
+    def test_umask_scoped_to_env_file_write(self):
+        """Guards bug I from PR #323: `umask 077` must not leak into the command.
+
+        The restrictive umask that protects the temp env file is wrapped in a
+        subshell `(umask 077 && ...)`, so the user's command runs under the
+        default umask. Before the fix, `umask 077` ran at the top of the chain
+        and persisted, making files the command created mode-0600.
+        """
+        from benchflow.sandbox.docker import DockerSandbox
+
+        wrapped = DockerSandbox._wrap_command_with_env_file(
+            {"FOO": "bar"}, "the-user-command"
+        )
+        # The umask is scoped to a `( ... )` subshell containing the env-file
+        # write — it is not a bare top-level statement that bleeds through.
+        assert "(umask 077 &&" in wrapped
+        # `umask 077` only ever appears immediately inside the `(` subshell.
+        assert wrapped.count("umask 077") == 1
+        assert "(umask 077" in wrapped
+        # The subshell closes before `set -a`/source/user command run, so the
+        # umask does not affect anything after it.
+        post_subshell = wrapped.split(")", 1)[1]
+        assert "umask" not in post_subshell
+        assert post_subshell.lstrip().startswith("&& set -a")
+        assert wrapped.endswith("the-user-command")
+
+    def test_non_identifier_env_keys_do_not_break_exec(self):
+        """Guards bug H from PR #323: non-identifier env keys must not abort.
+
+        Env keys that are valid process env names but not valid shell
+        identifiers (e.g. containing `.` or `-`) cannot be assigned via
+        `export NAME=...`. Emitting them would make `. {env_path}` fail and the
+        user command would never run, so they are skipped — valid keys still
+        flow through and the wrapped command stays intact.
+        """
+        import base64
+
+        from benchflow.sandbox.docker import DockerSandbox
+
+        env = {
+            "VALID_KEY": "keep-me",
+            "dotted.key": "drop-me",
+            "dashed-key": "drop-me-too",
+        }
+        wrapped = DockerSandbox._wrap_command_with_env_file(env, "run-it")
+
+        # Decode the base64 env body that gets sourced inside the container.
+        token = wrapped.split("printf %s ", 1)[1].split(" |", 1)[0]
+        encoded = token.strip("'")
+        body = base64.b64decode(encoded).decode()
+
+        # The valid identifier is exported; non-identifier keys are skipped so
+        # sourcing the file cannot fail.
+        assert "export VALID_KEY=" in body
+        assert "dotted.key" not in body
+        assert "dashed-key" not in body
+        # The user command is still wrapped and reachable.
+        assert wrapped.endswith("run-it")