From ea4f93ecf2e884c2fb63464bec8398a222555212 Mon Sep 17 00:00:00 2001 From: JackyCSer Date: Mon, 16 Mar 2026 23:26:52 +0800 Subject: [PATCH] fix: clean OpenClaw template files from judge workspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When `openclaw agents add` creates the judge agent, it scaffolds AGENTS.md, SOUL.md, BOOTSTRAP.md and other template files into the workspace. These files instruct the agent to perform a bootstrap / personality flow (read SOUL.md, introduce itself, etc.) before processing any prompt. This causes the judge agent to ignore the grading prompt entirely and output bootstrap text like 'I'm Claw, bootstrap complete...' instead of the expected JSON scores. The JSON parser then fails with 'Failed to parse judge JSON response', giving 0 scores to all llm_judge and hybrid tasks — regardless of how well the tested model actually performed. In our testing with doubao-seed-2.0-pro, 13 out of 23 tasks were affected (7 pure llm_judge + 6 hybrid), severely underestimating the model's actual score (reported 40% vs estimated 70%+ actual). Fix: add _clean_judge_workspace() that removes scaffolded template files after agent creation, ensuring the judge operates as a pure grading function. --- scripts/lib_grading.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py index a345e8d..5a78915 100644 --- a/scripts/lib_grading.py +++ b/scripts/lib_grading.py @@ -296,10 +296,40 @@ def _ensure_judge_agent(judge_agent_prefix: str, judge_model: str, skill_dir: Pa model_slug = slugify_model(judge_model) agent_id = f"{judge_agent_prefix}-{model_slug}" workspace = Path("/tmp/pinchbench/judge/workspace") - ensure_agent_exists(agent_id, judge_model, workspace) + created = ensure_agent_exists(agent_id, judge_model, workspace) + # OpenClaw `agents add` scaffolds AGENTS.md, SOUL.md, BOOTSTRAP.md, etc. + # into the workspace. 
def _clean_judge_workspace(workspace: Path) -> None:
    """Remove OpenClaw-scaffolded template files that interfere with judge grading.

    Deletes a fixed set of template files and directories from the top level
    of *workspace*. Entries that are not present are skipped, so calling this
    again on an already-clean workspace is a no-op.

    Args:
        workspace: Directory used as the judge agent's workspace.

    Raises:
        OSError: If a template file is present but cannot be unlinked (for
            example due to permissions). Directory removal is best-effort:
            a failure there is logged as a warning instead of being raised.
    """
    import shutil

    template_files = (
        "AGENTS.md", "SOUL.md", "BOOTSTRAP.md", "HEARTBEAT.md",
        "IDENTITY.md", "TOOLS.md", "USER.md", "MEMORY.md",
    )
    template_dirs = (".git", "memory")
    removed = 0

    for name in template_files:
        target = workspace / name
        # EAFP instead of exists()+unlink(): no window in which the file can
        # vanish between the check and the delete, and dangling symlinks
        # (for which exists() is False) are cleaned up as well.
        try:
            target.unlink()
        except FileNotFoundError:
            continue
        removed += 1

    for name in template_dirs:
        target = workspace / name
        if not target.is_dir():
            continue
        shutil.rmtree(target, ignore_errors=True)
        # ignore_errors=True hides failures, so confirm the directory is
        # really gone before counting it; otherwise the summary below would
        # overstate what was cleaned.
        if target.exists():
            logger.warning(
                "Could not fully remove %s from judge workspace", target
            )
        else:
            removed += 1

    if removed:
        logger.info("Cleaned %d template files/dirs from judge workspace", removed)