From ea4f93ecf2e884c2fb63464bec8398a222555212 Mon Sep 17 00:00:00 2001
From: JackyCSer <jackycser@gmail.com>
Date: Mon, 16 Mar 2026 23:26:52 +0800
Subject: [PATCH] fix: clean OpenClaw template files from judge workspace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When `openclaw agents add` creates the judge agent, it scaffolds
AGENTS.md, SOUL.md, BOOTSTRAP.md and other template files into the
workspace. These files instruct the agent to perform a bootstrap /
personality flow (read SOUL.md, introduce itself, etc.) before
processing any prompt.

This causes the judge agent to ignore the grading prompt entirely
and output bootstrap text like 'I'm Claw, bootstrap complete...'
instead of the expected JSON scores. The JSON parser then fails with
'Failed to parse judge JSON response', giving 0 scores to all
llm_judge and hybrid tasks — regardless of how well the tested model
actually performed.

In our testing with doubao-seed-2.0-pro, 13 out of 23 tasks were
affected (7 pure llm_judge + 6 hybrid), so the reported score severely
underestimated the model's actual performance (40% reported vs. an
estimated 70%+).

Fix: add _clean_judge_workspace() that removes scaffolded template
files after agent creation, ensuring the judge operates as a pure
grading function.
---
 scripts/lib_grading.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py
index a345e8d..5a78915 100644
--- a/scripts/lib_grading.py
+++ b/scripts/lib_grading.py
@@ -296,10 +296,40 @@ def _ensure_judge_agent(judge_agent_prefix: str, judge_model: str, skill_dir: Pa
     model_slug = slugify_model(judge_model)
     agent_id = f"{judge_agent_prefix}-{model_slug}"
     workspace = Path("/tmp/pinchbench/judge/workspace")
-    ensure_agent_exists(agent_id, judge_model, workspace)
+    created = ensure_agent_exists(agent_id, judge_model, workspace)
+    # OpenClaw `agents add` scaffolds AGENTS.md, SOUL.md, BOOTSTRAP.md, etc.
+    # into the workspace. These template files instruct the agent to perform a
+    # bootstrap / personality flow (read SOUL.md, do introductions, etc.)
+    # instead of acting as a pure grading function. Remove them so the judge
+    # only responds to the grading prompt with JSON.
+    _clean_judge_workspace(workspace)
     return agent_id
 
 
+def _clean_judge_workspace(workspace: Path) -> None:
+    """Remove OpenClaw-scaffolded template files/dirs from the judge workspace (best effort)."""
+    import shutil
+
+    template_files = (
+        "AGENTS.md", "SOUL.md", "BOOTSTRAP.md", "HEARTBEAT.md",
+        "IDENTITY.md", "TOOLS.md", "USER.md", "MEMORY.md",
+    )
+    template_dirs = (".git", "memory")
+    removed = 0
+    targets = [(workspace / name, Path.unlink) for name in template_files]
+    targets += [(workspace / name, shutil.rmtree) for name in template_dirs]
+    for target, remove in targets:
+        try:
+            remove(target)  # EAFP: no exists()/is_dir() pre-check to race against
+            removed += 1  # counted only when the removal actually succeeded
+        except FileNotFoundError:
+            continue  # nothing by this name in the workspace (e.g. already cleaned)
+        except OSError as exc:  # wrong entry type, permissions, ...: warn, don't propagate
+            logger.warning("Could not remove %s from judge workspace: %s", target, exc)
+    if removed:
+        logger.info("Cleaned %d template files/dirs from judge workspace", removed)
+
+
 def _parse_judge_response(transcript: List[Dict[str, Any]]) -> Dict[str, Any]:
     content_chunks: List[str] = []
     for event in transcript: