pinchbench · kaiaiagent · Mar 8, 2026 · Mar 10, 2026
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
@@ -62,8 +62,6 @@ def execute_task(self, task: Task, simulate: bool = False) -> Dict[str, Any]:
         Returns:
             Dictionary containing execution results
         """
-        if simulate:
-            logger.info("Simulate flag no longer supported for execute_task")
         raise NotImplementedError("Use execute_openclaw_task helper for real runs")
 
 

diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
@@ -6,6 +6,7 @@
 
 import json
 import logging
+import shutil
 import subprocess
 import time
 from pathlib import Path
@@ -18,6 +19,13 @@
 MAX_OPENCLAW_MESSAGE_CHARS = 4000
 
 
+def _decode_output(data: bytes | str | None) -> str:
+    """Return subprocess output as str: bytes (e.g. from TimeoutExpired) decode as UTF-8 with replacement; None -> ""."""
+    if isinstance(data, bytes):
+        return data.decode("utf-8", errors="replace")
+    return data or ""
+
+
 def slugify_model(model_id: str) -> str:
     return model_id.replace("/", "-").replace(".", "-")
 
@@ -279,11 +287,18 @@ def _find_recent_session_path(agent_dir: Path, started_at: float) -> Path | None
     if not candidates:
         return None
     tolerance_seconds = 5.0
-    recent_candidates = [
-        path for path in candidates if path.stat().st_mtime >= (started_at - tolerance_seconds)
-    ]
-    pool = recent_candidates or candidates
-    return max(pool, key=lambda path: path.stat().st_mtime)
+    # Cache stat results to avoid TOCTOU race and redundant syscalls
+    candidates_with_mtime = []
+    for path in candidates:
+        try:
+            candidates_with_mtime.append((path, path.stat().st_mtime))
+        except OSError:
+            continue
+    if not candidates_with_mtime:
+        return None
+    recent = [(p, m) for p, m in candidates_with_mtime if m >= (started_at - tolerance_seconds)]
+    pool = recent or candidates_with_mtime
+    return max(pool, key=lambda x: x[1])[0]
 
 
 def _load_transcript(agent_id: str, session_id: str, started_at: float) -> List[Dict[str, Any]]:
@@ -449,24 +464,25 @@ def execute_openclaw_task(
         exit_code = result.returncode
     except subprocess.TimeoutExpired as exc:
         timed_out = True
-        stdout = exc.stdout or ""
-        stderr = exc.stderr or ""
+        stdout = _decode_output(exc.stdout)
+        stderr = _decode_output(exc.stderr)
     except FileNotFoundError as exc:
         stderr = f"openclaw command not found: {exc}"
 
     transcript = _load_transcript(agent_id, session_id, start_time)
     usage = _extract_usage_from_transcript(transcript)
     execution_time = time.time() - start_time
 
-    status = "success"
-    if timed_out:
-        status = "timeout"
-    if not transcript:
+    if stderr and "openclaw command not found" in str(stderr):
         status = "error"
-    if exit_code not in (0, -1) and not timed_out:
+    elif timed_out:
+        status = "timeout"
+    elif exit_code not in (0, -1):
         status = "error"
-    if stderr and "openclaw command not found" in str(stderr):
+    elif not transcript:
         status = "error"
+    else:
+        status = "success"
 
     # Verbose logging for debugging
     if verbose:
@@ -485,6 +501,8 @@ def execute_openclaw_task(
                 msg = entry.get("message", {})
                 role = msg.get("role", "unknown")
                 content = msg.get("content", "")
+                if not isinstance(content, str):
+                    content = str(content) if content else ""
                 if role == "assistant":
                     # Truncate long responses
                     preview = content[:500] + "..." if len(content) > 500 else content
@@ -593,8 +611,8 @@ def run_openclaw_prompt(
                 break
         except subprocess.TimeoutExpired as exc:
             timed_out = True
-            stdout += exc.stdout or ""
-            stderr += exc.stderr or ""
+            stdout += _decode_output(exc.stdout)
+            stderr += _decode_output(exc.stderr)
             break
         except FileNotFoundError as exc:
             stderr += f"openclaw command not found: {exc}"
@@ -603,15 +621,16 @@ def run_openclaw_prompt(
     transcript = _load_transcript(agent_id, session_id, start_time)
     execution_time = time.time() - start_time
 
-    status = "success"
-    if timed_out:
-        status = "timeout"
-    if not transcript:
+    if stderr and "openclaw command not found" in str(stderr):
         status = "error"
-    if exit_code not in (0, -1) and not timed_out:
+    elif timed_out:
+        status = "timeout"
+    elif exit_code not in (0, -1):
         status = "error"
-    if stderr and "openclaw command not found" in str(stderr):
+    elif not transcript:
         status = "error"
+    else:
+        status = "success"
 
     return {
         "agent_id": agent_id,

diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py
@@ -257,13 +257,17 @@ def _summarize_transcript(transcript: List[Dict[str, Any]]) -> str:
                     )
         elif role == "toolResult":
             content = msg.get("content", [])
-            if content:
+            if isinstance(content, str):
+                summary_parts.append(f"Result: {content[:200]}")
+            elif isinstance(content, list) and content:
                 result_preview = str(content[0])[:200]
                 summary_parts.append(f"Result: {result_preview}")
         elif role == "user":
             content = msg.get("content", [])
-            if content:
-                summary_parts.append(f"User: {content[0]}")
+            if isinstance(content, str):
+                summary_parts.append(f"User: {content[:200]}")
+            elif isinstance(content, list) and content:
+                summary_parts.append(f"User: {str(content[0])[:200]}")
     return "\n".join(summary_parts)
 
 
@@ -418,9 +422,15 @@ def _normalize_judge_response(parsed: Dict[str, Any]) -> Dict[str, Any]:
 
     # Extract total score
     if "total" in parsed and parsed["total"] is not None:
-        result["total"] = float(parsed["total"]) if isinstance(parsed["total"], (int, float)) else None
-    elif "score" in parsed and isinstance(parsed["score"], (int, float)):
-        result["total"] = float(parsed["score"])
+        try:
+            result["total"] = float(parsed["total"])
+        except (TypeError, ValueError):
+            result["total"] = None
+    elif "score" in parsed and parsed["score"] is not None:
+        try:
+            result["total"] = float(parsed["score"])
+        except (TypeError, ValueError):
+            pass
     elif result["scores"]:
         # Calculate average if we have individual scores but no total
         values = [v for v in result["scores"].values() if isinstance(v, (int, float))]

diff --git a/scripts/lib_upload.py b/scripts/lib_upload.py
@@ -18,7 +18,7 @@
 
 DEFAULT_SERVER_URL = "https://api.pinchbench.com"
 DEFAULT_TIMEOUT_SECONDS = 30.0
-CONFIG_DIR = Path(__file__).resolve().parent / ".pinchbench"
+CONFIG_DIR = Path.home() / ".pinchbench"
 CONFIG_PATH = CONFIG_DIR / "config.json"
 
 
@@ -302,7 +302,7 @@ def _format_timestamp(timestamp: Any) -> str:
 
 
 def _read_client_version() -> str:
-    pyproject = Path(__file__).with_name("pyproject.toml")
+    pyproject = Path(__file__).resolve().parent.parent / "pyproject.toml"
     if not pyproject.exists():
         return ""
     for line in pyproject.read_text(encoding="utf-8").splitlines():
@@ -368,9 +368,9 @@ def _collect_linux_info() -> Dict[str, Any]:
         total_kb = _parse_meminfo_value(meminfo, "MemTotal")
         avail_kb = _parse_meminfo_value(meminfo, "MemAvailable")
         if total_kb is not None:
-            info["memory_total_gb"] = round(total_kb / 1e6, 1)
+            info["memory_total_gb"] = round(total_kb / (1024 * 1024), 1)
         if avail_kb is not None:
-            info["memory_available_gb"] = round(avail_kb / 1e6, 1)
+            info["memory_available_gb"] = round(avail_kb / (1024 * 1024), 1)
     except OSError:
         pass