Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,6 @@ def execute_task(self, task: Task, simulate: bool = False) -> Dict[str, Any]:
Returns:
Dictionary containing execution results
"""
if simulate:
logger.info("Simulate flag no longer supported for execute_task")
raise NotImplementedError("Use execute_openclaw_task helper for real runs")


Expand Down
61 changes: 40 additions & 21 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import json
import logging
import shutil
import subprocess
import time
from pathlib import Path
Expand All @@ -18,6 +19,13 @@
MAX_OPENCLAW_MESSAGE_CHARS = 4000


def _decode_output(data: bytes | str | None) -> str:
"""Decode subprocess output that may be bytes (e.g. from TimeoutExpired)."""
if isinstance(data, bytes):
return data.decode("utf-8", errors="replace")
return data or ""


def slugify_model(model_id: str) -> str:
    """Return *model_id* with every ``/`` and ``.`` replaced by ``-``."""
    # One translation pass instead of two chained replace() calls.
    separators_to_dash = str.maketrans({"/": "-", ".": "-"})
    return model_id.translate(separators_to_dash)

Expand Down Expand Up @@ -279,11 +287,18 @@ def _find_recent_session_path(agent_dir: Path, started_at: float) -> Path | None
if not candidates:
return None
tolerance_seconds = 5.0
recent_candidates = [
path for path in candidates if path.stat().st_mtime >= (started_at - tolerance_seconds)
]
pool = recent_candidates or candidates
return max(pool, key=lambda path: path.stat().st_mtime)
# Cache stat results to avoid TOCTOU race and redundant syscalls
candidates_with_mtime = []
for path in candidates:
try:
candidates_with_mtime.append((path, path.stat().st_mtime))
except OSError:
continue
if not candidates_with_mtime:
return None
recent = [(p, m) for p, m in candidates_with_mtime if m >= (started_at - tolerance_seconds)]
pool = recent or candidates_with_mtime
return max(pool, key=lambda x: x[1])[0]


def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[Dict[str, Any]]:
Expand Down Expand Up @@ -449,24 +464,25 @@ def execute_openclaw_task(
exit_code = result.returncode
except subprocess.TimeoutExpired as exc:
timed_out = True
stdout = exc.stdout or ""
stderr = exc.stderr or ""
stdout = _decode_output(exc.stdout)
stderr = _decode_output(exc.stderr)
except FileNotFoundError as exc:
stderr = f"openclaw command not found: {exc}"

transcript = _load_transcript(agent_id, session_id, start_time)
usage = _extract_usage_from_transcript(transcript)
execution_time = time.time() - start_time

status = "success"
if timed_out:
status = "timeout"
if not transcript:
if stderr and "openclaw command not found" in str(stderr):
status = "error"
if exit_code not in (0, -1) and not timed_out:
elif timed_out:
status = "timeout"
elif exit_code not in (0, -1):
status = "error"
if stderr and "openclaw command not found" in str(stderr):
elif not transcript:
status = "error"
else:
status = "success"

# Verbose logging for debugging
if verbose:
Expand All @@ -485,6 +501,8 @@ def execute_openclaw_task(
msg = entry.get("message", {})
role = msg.get("role", "unknown")
content = msg.get("content", "")
if not isinstance(content, str):
content = str(content) if content else ""
if role == "assistant":
# Truncate long responses
preview = content[:500] + "..." if len(content) > 500 else content
Expand Down Expand Up @@ -593,8 +611,8 @@ def run_openclaw_prompt(
break
except subprocess.TimeoutExpired as exc:
timed_out = True
stdout += exc.stdout or ""
stderr += exc.stderr or ""
stdout += _decode_output(exc.stdout)
stderr += _decode_output(exc.stderr)
break
except FileNotFoundError as exc:
stderr += f"openclaw command not found: {exc}"
Expand All @@ -603,15 +621,16 @@ def run_openclaw_prompt(
transcript = _load_transcript(agent_id, session_id, start_time)
execution_time = time.time() - start_time

status = "success"
if timed_out:
status = "timeout"
if not transcript:
if stderr and "openclaw command not found" in str(stderr):
status = "error"
if exit_code not in (0, -1) and not timed_out:
elif timed_out:
status = "timeout"
elif exit_code not in (0, -1):
status = "error"
if stderr and "openclaw command not found" in str(stderr):
elif not transcript:
status = "error"
else:
status = "success"

return {
"agent_id": agent_id,
Expand Down
22 changes: 16 additions & 6 deletions scripts/lib_grading.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,13 +257,17 @@ def _summarize_transcript(transcript: List[Dict[str, Any]]) -> str:
)
elif role == "toolResult":
content = msg.get("content", [])
if content:
if isinstance(content, str):
summary_parts.append(f"Result: {content[:200]}")
elif isinstance(content, list) and content:
result_preview = str(content[0])[:200]
summary_parts.append(f"Result: {result_preview}")
elif role == "user":
content = msg.get("content", [])
if content:
summary_parts.append(f"User: {content[0]}")
if isinstance(content, str):
summary_parts.append(f"User: {content[:200]}")
elif isinstance(content, list) and content:
summary_parts.append(f"User: {str(content[0])[:200]}")
return "\n".join(summary_parts)


Expand Down Expand Up @@ -418,9 +422,15 @@ def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]:

# Extract total score
if "total" in parsed and parsed["total"] is not None:
result["total"] = float(parsed["total"]) if isinstance(parsed["total"], (int, float)) else None
elif "score" in parsed and isinstance(parsed["score"], (int, float)):
result["total"] = float(parsed["score"])
try:
result["total"] = float(parsed["total"])
except (TypeError, ValueError):
result["total"] = None
elif "score" in parsed and parsed["score"] is not None:
try:
result["total"] = float(parsed["score"])
except (TypeError, ValueError):
pass
elif result["scores"]:
# Calculate average if we have individual scores but no total
values = [v for v in result["scores"].values() if isinstance(v, (int, float))]
Expand Down
8 changes: 4 additions & 4 deletions scripts/lib_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

DEFAULT_SERVER_URL = "https://api.pinchbench.com"
DEFAULT_TIMEOUT_SECONDS = 30.0
CONFIG_DIR = Path(__file__).resolve().parent / ".pinchbench"
CONFIG_DIR = Path.home() / ".pinchbench"
CONFIG_PATH = CONFIG_DIR / "config.json"


Expand Down Expand Up @@ -302,7 +302,7 @@ def _format_timestamp(timestamp: Any) -> str:


def _read_client_version() -> str:
pyproject = Path(__file__).with_name("pyproject.toml")
pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
if not pyproject.exists():
return ""
for line in pyproject.read_text(encoding="utf-8").splitlines():
Expand Down Expand Up @@ -368,9 +368,9 @@ def _collect_linux_info() -> Dict[str, Any]:
total_kb = _parse_meminfo_value(meminfo, "MemTotal")
avail_kb = _parse_meminfo_value(meminfo, "MemAvailable")
if total_kb is not None:
info["memory_total_gb"] = round(total_kb / 1e6, 1)
info["memory_total_gb"] = round(total_kb / (1024 * 1024), 1)
if avail_kb is not None:
info["memory_available_gb"] = round(avail_kb / 1e6, 1)
info["memory_available_gb"] = round(avail_kb / (1024 * 1024), 1)
except OSError:
pass

Expand Down