diff --git a/scripts/lib_grading.py b/scripts/lib_grading.py
index a345e8d..5a78915 100644
--- a/scripts/lib_grading.py
+++ b/scripts/lib_grading.py
@@ -296,10 +296,56 @@ def _ensure_judge_agent(judge_agent_prefix: str, judge_model: str, skill_dir: Pa
     model_slug = slugify_model(judge_model)
     agent_id = f"{judge_agent_prefix}-{model_slug}"
     workspace = Path("/tmp/pinchbench/judge/workspace")
-    ensure_agent_exists(agent_id, judge_model, workspace)
+    created = ensure_agent_exists(agent_id, judge_model, workspace)
+    # OpenClaw `agents add` scaffolds AGENTS.md, SOUL.md, BOOTSTRAP.md, etc.
+    # into the workspace. These template files instruct the agent to perform a
+    # bootstrap / personality flow (read SOUL.md, do introductions, etc.)
+    # instead of acting as a pure grading function. Remove them so the judge
+    # only responds to the grading prompt with JSON.
+    _clean_judge_workspace(workspace)
     return agent_id
 
 
+def _clean_judge_workspace(workspace: Path) -> None:
+    """Remove OpenClaw-scaffolded template files that interfere with judge grading.
+
+    Deletes the known template files and directories directly under
+    ``workspace``. Entries that are already absent are skipped, and only
+    entries that were actually removed are counted in the summary log line.
+
+    Args:
+        workspace: Judge agent workspace directory to clean.
+    """
+    import shutil
+
+    template_files = (
+        "AGENTS.md", "SOUL.md", "BOOTSTRAP.md", "HEARTBEAT.md",
+        "IDENTITY.md", "TOOLS.md", "USER.md", "MEMORY.md",
+    )
+    template_dirs = (".git", "memory")
+    removed = 0
+    for name in template_files:
+        try:
+            # EAFP: no exists()/unlink() race, and dangling symlinks
+            # (for which exists() is False) are removed too.
+            (workspace / name).unlink()
+        except FileNotFoundError:
+            continue
+        removed += 1
+    for name in template_dirs:
+        d = workspace / name
+        if not d.is_dir():
+            continue
+        shutil.rmtree(d, ignore_errors=True)
+        if d.exists():
+            # rmtree swallowed an error; do not report this as cleaned.
+            logger.warning("Could not remove %s from judge workspace", d)
+        else:
+            removed += 1
+    if removed:
+        logger.info("Cleaned %d template files/dirs from judge workspace", removed)
+
+
 def _parse_judge_response(transcript: List[Dict[str, Any]]) -> Dict[str, Any]:
     content_chunks: List[str] = []
     for event in transcript: