diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 36b5e05..cf7cbf0 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -62,8 +62,6 @@ def execute_task(self, task: Task, simulate: bool = False) -> Dict[str, Any]: Returns: Dictionary containing execution results """ - if simulate: - logger.info("Simulate flag no longer supported for execute_task") raise NotImplementedError("Use execute_openclaw_task helper for real runs") diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index 053619a..2e03141 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -6,6 +6,7 @@ import json import logging +import shutil import subprocess import time from pathlib import Path @@ -18,6 +19,13 @@ MAX_OPENCLAW_MESSAGE_CHARS = 4000 +def _decode_output(data: bytes | str | None) -> str: + """Decode subprocess output that may be bytes (e.g. from TimeoutExpired).""" + if isinstance(data, bytes): + return data.decode("utf-8", errors="replace") + return data or "" + + def slugify_model(model_id: str) -> str: return model_id.replace("/", "-").replace(".", "-") @@ -279,11 +287,18 @@ def _find_recent_session_path(agent_dir: Path, started_at: float) -> Path | None if not candidates: return None tolerance_seconds = 5.0 - recent_candidates = [ - path for path in candidates if path.stat().st_mtime >= (started_at - tolerance_seconds) - ] - pool = recent_candidates or candidates - return max(pool, key=lambda path: path.stat().st_mtime) + # Cache stat results to avoid TOCTOU race and redundant syscalls + candidates_with_mtime = [] + for path in candidates: + try: + candidates_with_mtime.append((path, path.stat().st_mtime)) + except OSError: + continue + if not candidates_with_mtime: + return None + recent = [(p, m) for p, m in candidates_with_mtime if m >= (started_at - tolerance_seconds)] + pool = recent or candidates_with_mtime + return max(pool, key=lambda x: x[1])[0] def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[Dict[str, Any]]: @@ -449,8 +464,8 @@ def execute_openclaw_task( exit_code = result.returncode except subprocess.TimeoutExpired as exc: timed_out = True - stdout = exc.stdout or "" - stderr = exc.stderr or "" + stdout = _decode_output(exc.stdout) + stderr = _decode_output(exc.stderr) except FileNotFoundError as exc: stderr = f"openclaw command not found: {exc}" @@ -458,15 +473,16 @@ def execute_openclaw_task( usage = _extract_usage_from_transcript(transcript) execution_time = time.time() - start_time - status = "success" - if timed_out: - status = "timeout" - if not transcript: + if stderr and "openclaw command not found" in str(stderr): status = "error" - if exit_code not in (0, -1) and not timed_out: + elif timed_out: + status = "timeout" + elif exit_code not in (0, -1): status = "error" - if stderr and "openclaw command not found" in str(stderr): + elif not transcript: status = "error" + else: + status = "success" # Verbose logging for debugging if verbose: @@ -485,6 +501,8 @@ def execute_openclaw_task( msg = entry.get("message", {}) role = msg.get("role", "unknown") content = msg.get("content", "") + if not isinstance(content, str): + content = str(content) if content else "" if role == "assistant": # Truncate long responses preview = content[:500] + "..." if len(content) > 500 else content @@ -593,8 +611,8 @@ def run_openclaw_prompt( break except subprocess.TimeoutExpired as exc: timed_out = True - stdout += exc.stdout or "" - stderr += exc.stderr or "" + stdout += _decode_output(exc.stdout) + stderr += _decode_output(exc.stderr) break except FileNotFoundError as exc: stderr += f"openclaw command not found: {exc}" @@ -603,15 +621,16 @@ def run_openclaw_prompt( transcript = _load_transcript(agent_id, session_id, start_time) execution_time = time.time() - start_time - status = "success" - if timed_out: - status = "timeout" - if not transcript: + if stderr and "openclaw command not found" in str(stderr): status = "error" - if exit_code not in (0, -1) and not timed_out: + elif timed_out: + status = "timeout" + elif exit_code not in (0, -1): status = "error" - if stderr and "openclaw command not found" in str(stderr): + elif not transcript: status = "error" + else: + status = "success" return { "agent_id": agent_id, diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py index a25649e..49d1278 100644 --- a/scripts/lib_grading.py +++ b/scripts/lib_grading.py @@ -257,13 +257,17 @@ def _summarize_transcript(transcript: List[Dict[str, Any]]) -> str: ) elif role == "toolResult": content = msg.get("content", []) - if content: + if isinstance(content, str): + summary_parts.append(f"Result: {content[:200]}") + elif isinstance(content, list) and content: result_preview = str(content[0])[:200] summary_parts.append(f"Result: {result_preview}") elif role == "user": content = msg.get("content", []) - if content: - summary_parts.append(f"User: {content[0]}") + if isinstance(content, str): + summary_parts.append(f"User: {content[:200]}") + elif isinstance(content, list) and content: + summary_parts.append(f"User: {str(content[0])[:200]}") return "\n".join(summary_parts) @@ -418,9 +422,15 @@ def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]: # Extract total score if "total" in parsed and parsed["total"] is not None: - result["total"] = float(parsed["total"]) if isinstance(parsed["total"], (int, float)) else None - elif "score" in parsed and isinstance(parsed["score"], (int, float)): - result["total"] = float(parsed["score"]) + try: + result["total"] = float(parsed["total"]) + except (TypeError, ValueError): + result["total"] = None + elif "score" in parsed and parsed["score"] is not None: + try: + result["total"] = float(parsed["score"]) + except (TypeError, ValueError): + pass elif result["scores"]: # Calculate average if we have individual scores but no total values = [v for v in result["scores"].values() if isinstance(v, (int, float))] diff --git a/scripts/lib_upload.py b/scripts/lib_upload.py index 20102f8..18abae1 100644 --- a/scripts/lib_upload.py +++ b/scripts/lib_upload.py @@ -18,7 +18,7 @@ DEFAULT_SERVER_URL = "https://api.pinchbench.com" DEFAULT_TIMEOUT_SECONDS = 30.0 -CONFIG_DIR = Path(__file__).resolve().parent / ".pinchbench" +CONFIG_DIR = Path.home() / ".pinchbench" CONFIG_PATH = CONFIG_DIR / "config.json" @@ -302,7 +302,7 @@ def _format_timestamp(timestamp: Any) -> str: def _read_client_version() -> str: - pyproject = Path(__file__).with_name("pyproject.toml") + pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml" if not pyproject.exists(): return "" for line in pyproject.read_text(encoding="utf-8").splitlines(): @@ -368,9 +368,9 @@ def _collect_linux_info() -> Dict[str, Any]: total_kb = _parse_meminfo_value(meminfo, "MemTotal") avail_kb = _parse_meminfo_value(meminfo, "MemAvailable") if total_kb is not None: - info["memory_total_gb"] = round(total_kb / 1e6, 1) + info["memory_total_gb"] = round(total_kb / (1024 * 1024), 1) if avail_kb is not None: - info["memory_available_gb"] = round(avail_kb / 1e6, 1) + info["memory_available_gb"] = round(avail_kb / (1024 * 1024), 1) except OSError: pass