Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,33 @@ Skip uploading with `--no-upload` if you just want local results.
| `--suite SUITE` | `all`, `automated-only`, or comma-separated task IDs |
| `--runs N` | Number of runs per task for averaging |
| `--timeout-multiplier N` | Scale timeouts for slower models |
| `--thinking LEVELS` | Comma-separated thinking levels (e.g., `low,medium,high`) |
| `--output-dir DIR` | Where to save results (default: `results/`) |
| `--no-upload` | Skip uploading to leaderboard |
| `--register` | Request an API token for submissions |
| `--upload FILE` | Upload a previous results JSON |

## Thinking Levels

Many models support configurable thinking/reasoning levels. Test how different reasoning depths affect task performance:

```bash
# Test multiple thinking levels
./scripts/run.sh --model anthropic/claude-sonnet-4 --thinking low,medium,high

# Compare a single level against default
./scripts/run.sh --model anthropic/claude-sonnet-4 --thinking high
```

Valid levels: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive`

**Model-specific notes:**
- `xhigh` is only available for GPT-5.x models (gpt-5.4, gpt-5.2, codex variants)
- `adaptive` is provider-managed reasoning budget (Anthropic Claude 4.6 family)
- Levels that are invalid for your model trigger a warning and are skipped

Results include a `thinking_aggregates` section with per-level statistics, and each task result includes the `thinking_level` used.

## Contributing Tasks

We welcome new tasks! Check out [`tasks/TASK_TEMPLATE.md`](tasks/TASK_TEMPLATE.md) for the format. Good tasks are:
Expand Down
24 changes: 24 additions & 0 deletions SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ uv run benchmark.py --model anthropic/claude-sonnet-4 --no-upload
| `--output-dir` | Results directory (default: `results/`) |
| `--timeout-multiplier` | Scale task timeouts for slower models |
| `--runs` | Number of runs per task for averaging |
| `--thinking` | Comma-separated thinking levels (e.g., `low,medium,high`) |
| `--no-upload` | Skip uploading to leaderboard |
| `--register` | Request new API token for submissions |
| `--upload FILE` | Upload previous results JSON |
Expand All @@ -89,6 +90,29 @@ uv run benchmark.py --register
uv run benchmark.py --model anthropic/claude-sonnet-4
```

## Thinking Levels

Many models support different thinking/reasoning levels (e.g., Claude's extended thinking). PinchBench can run tasks across multiple thinking levels to measure how reasoning depth affects performance:

```bash
# Run with multiple thinking levels
uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking low,medium,high

# Run with a single thinking level
uv run benchmark.py --model anthropic/claude-sonnet-4 --thinking high
```

Valid thinking levels: `off`, `minimal`, `low`, `medium`, `high`, `xhigh`, `adaptive`

**Model-specific notes:**
- `xhigh` is only supported by GPT-5.x models (gpt-5.4, gpt-5.2, codex variants)
- `adaptive` is provider-managed reasoning (Anthropic Claude 4.6 family)
- Levels that are invalid for your model trigger a warning and are skipped

Results include per-level aggregates:
- `thinking_aggregates`: Summary statistics for each thinking level
- Per-task results include `thinking_level` field

## Results

Results are saved as JSON in the output directory:
Expand Down
266 changes: 185 additions & 81 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
ensure_agent_exists,
execute_openclaw_task,
slugify_model,
THINKING_LEVELS,
validate_thinking_level,
)
from lib_grading import GradeResult, grade_task
from lib_tasks import Task, TaskLoader
Expand Down Expand Up @@ -212,6 +214,15 @@ def _parse_args() -> argparse.Namespace:
default=1,
help="Number of runs per task for averaging",
)
parser.add_argument(
"--thinking",
type=str,
default=None,
help="Comma-separated thinking levels to test (e.g., 'low,medium,high'). "
f"Valid levels: {', '.join(THINKING_LEVELS)}. "
"Note: 'xhigh' requires GPT-5.x models; 'adaptive' is for Anthropic Claude 4.6. "
"If not specified, runs without explicit thinking level.",
)
parser.add_argument(
"--verbose",
"-v",
Expand All @@ -229,6 +240,43 @@ def _select_task_ids(tasks: List[Task], suite: str) -> Optional[List[str]]:
return [task_id.strip() for task_id in suite.split(",") if task_id.strip()]


def _parse_thinking_levels(
    thinking_arg: Optional[str],
    model_id: Optional[str] = None,
) -> List[Optional[str]]:
    """Turn the comma-separated --thinking string into validated levels.

    Args:
        thinking_arg: Raw comma-separated levels from the CLI, or None
            when the flag was omitted entirely.
        model_id: Optional model identifier used to check whether each
            level is compatible with the selected model.

    Returns:
        An ordered, de-duplicated list of validated levels, or [None]
        when no explicit thinking level was requested.

    Raises:
        ValueError: If --thinking was provided but every requested level
            failed validation for this model.
    """
    # Flag omitted: do a single run with the provider's default behavior.
    if thinking_arg is None:
        return [None]

    validated_levels: List[str] = []
    for raw in thinking_arg.split(","):
        candidate = validate_thinking_level(raw.strip(), model_id)
        # Keep only levels that validated, preserving first-seen order
        # and dropping duplicates.
        if candidate and candidate not in validated_levels:
            validated_levels.append(candidate)

    if not validated_levels:
        raise ValueError(
            "No valid thinking levels remain after validation. "
            "Check your --thinking values for this model."
        )

    return validated_levels


def _next_run_id(run_root: Path) -> str:
run_root.mkdir(parents=True, exist_ok=True)
existing = []
Expand Down Expand Up @@ -363,97 +411,147 @@ def main():
cleanup_agent_sessions(agent_id)

task_ids = _select_task_ids(runner.tasks, args.suite)
try:
thinking_levels = _parse_thinking_levels(args.thinking, args.model)
except ValueError as exc:
logger.error(str(exc))
sys.exit(2)
results = []
grades_by_task_id = {}
grades_by_task_and_thinking: Dict[str, Dict[str, Any]] = {}

tasks_to_run = runner.tasks
if task_ids is not None:
tasks_to_run = [task for task in runner.tasks if task.task_id in task_ids]
tasks_by_id = {task.task_id: task for task in tasks_to_run}

runs_per_task = max(1, args.runs)
for i, task in enumerate(tasks_to_run, 1):
task_grades = []
for run_index in range(runs_per_task):
logger.info("\n%s", "=" * 80)
logger.info(
"📋 Task %s/%s (Run %s/%s)",
i,
len(tasks_to_run),
run_index + 1,
runs_per_task,
)
logger.info("%s", "=" * 80)
execution_error = None
try:
result = execute_openclaw_task(
task=task,
agent_id=agent_id,
model_id=args.model,
run_id=f"{run_id}-{run_index + 1}",
timeout_multiplier=args.timeout_multiplier,
skill_dir=skill_dir,
verbose=args.verbose,
)
except Exception as exc:
execution_error = str(exc)
logger.warning(
"Task execution failed for %s, continuing: %s", task.task_id, exc
total_runs = len(tasks_to_run) * runs_per_task * len(thinking_levels)
run_counter = 0

for thinking_level in thinking_levels:
thinking_label = thinking_level or "default"
logger.info("\n%s", "=" * 80)
logger.info("🧠 Thinking Level: %s", thinking_label)
logger.info("%s", "=" * 80)

for i, task in enumerate(tasks_to_run, 1):
task_key = f"{task.task_id}:{thinking_label}" if thinking_level else task.task_id
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: Inconsistent key construction between storage and lookup

When thinking_level is None (no --thinking flag), task_key is set to bare task.task_id (e.g., "task_01"). However, the grading lookup at line 579 always constructs the primary key as f"{task_id}:default" (because result.get('thinking_level') or 'default' evaluates to "default" when thinking_level is None).

This means the primary lookup key "task_01:default" will never match the stored key "task_01". The fallback at line 580 (grades_by_task_and_thinking.get(result["task_id"], {})) saves it, but this is fragile — if the fallback is ever removed, grading data will silently be replaced with {}.

Consider using a consistent key format. For example, always include the label:

Suggested change
task_key = f"{task.task_id}:{thinking_label}" if thinking_level else task.task_id
task_key = f"{task.task_id}:{thinking_label}"

This would make task_key always "task_id:default" when no thinking level is set, matching the lookup at line 579.

task_grades = []

for run_index in range(runs_per_task):
run_counter += 1
logger.info("\n%s", "-" * 80)
logger.info(
"📋 Task %s/%s (Run %s/%s) [%s] — Overall progress: %s/%s",
i,
len(tasks_to_run),
run_index + 1,
runs_per_task,
thinking_label,
run_counter,
total_runs,
)
result = {
"agent_id": agent_id,
"task_id": task.task_id,
"status": "error",
"transcript": [],
"usage": {},
"workspace": "",
"exit_code": -1,
"timed_out": False,
"execution_time": 0.0,
"stdout": "",
"stderr": execution_error,
}
try:
grade = grade_task(task=task, execution_result=result, skill_dir=skill_dir, verbose=args.verbose)
except Exception as exc:
if execution_error:
note = f"Execution failed: {execution_error}; Grading failed: {exc}"
else:
note = f"Grading failed: {exc}"
logger.warning("Task grading failed for %s, continuing: %s", task.task_id, exc)
grade = GradeResult(
task_id=task.task_id,
score=0.0,
max_score=1.0,
grading_type=task.grading_type,
breakdown={},
notes=note,
logger.info("%s", "-" * 80)
execution_error = None
try:
result = execute_openclaw_task(
task=task,
agent_id=agent_id,
model_id=args.model,
run_id=f"{run_id}-{run_index + 1}",
timeout_multiplier=args.timeout_multiplier,
skill_dir=skill_dir,
thinking_level=thinking_level,
verbose=args.verbose,
)
except Exception as exc:
execution_error = str(exc)
logger.warning(
"Task execution failed for %s, continuing: %s", task.task_id, exc
)
result = {
"agent_id": agent_id,
"task_id": task.task_id,
"thinking_level": thinking_level,
"status": "error",
"transcript": [],
"usage": {},
"workspace": "",
"exit_code": -1,
"timed_out": False,
"execution_time": 0.0,
"stdout": "",
"stderr": execution_error,
}
try:
grade = grade_task(
task=task,
execution_result=result,
skill_dir=skill_dir,
verbose=args.verbose,
)
except Exception as exc:
if execution_error:
note = f"Execution failed: {execution_error}; Grading failed: {exc}"
else:
note = f"Grading failed: {exc}"
logger.warning("Task grading failed for %s, continuing: %s", task.task_id, exc)
grade = GradeResult(
task_id=task.task_id,
score=0.0,
max_score=1.0,
grading_type=task.grading_type,
breakdown={},
notes=note,
)
task_grades.append(grade)
result["thinking_level"] = thinking_level
results.append(result)

score_pct = grade.score / grade.max_score * 100 if grade.max_score > 0 else 0
status_emoji = "✅" if grade.score >= grade.max_score else "⚠️" if grade.score > 0 else "❌"
logger.info(
"%s Task %s [%s]: %.1f/%.1f (%.0f%%) - %s",
status_emoji,
task.task_id,
thinking_label,
grade.score,
grade.max_score,
score_pct,
grade.grading_type,
)
task_grades.append(grade)
results.append(result)
if grade.notes:
logger.info(" Notes: %s", grade.notes[:200])

task_scores = [grade.score for grade in task_grades]
grades_by_task_and_thinking[task_key] = {
"task_id": task.task_id,
"thinking_level": thinking_level,
"runs": [grade.to_dict() for grade in task_grades],
"mean": statistics.mean(task_scores),
"std": statistics.stdev(task_scores) if len(task_scores) > 1 else 0.0,
"min": min(task_scores),
"max": max(task_scores),
}

# Log score immediately after grading
score_pct = grade.score / grade.max_score * 100 if grade.max_score > 0 else 0
status_emoji = "✅" if grade.score >= grade.max_score else "⚠️" if grade.score > 0 else "❌"
logger.info(
"%s Task %s: %.1f/%.1f (%.0f%%) - %s",
status_emoji,
task.task_id,
grade.score,
grade.max_score,
score_pct,
grade.grading_type,
)
if grade.notes:
logger.info(" Notes: %s", grade.notes[:200])

task_scores = [grade.score for grade in task_grades]
grades_by_task_id[task.task_id] = {
"runs": [grade.to_dict() for grade in task_grades],
"mean": statistics.mean(task_scores),
"std": statistics.stdev(task_scores) if len(task_scores) > 1 else 0.0,
"min": min(task_scores),
"max": max(task_scores),
# Compute per-thinking-level aggregates
thinking_aggregates: Dict[str, Dict[str, Any]] = {}
for thinking_level in thinking_levels:
thinking_label = thinking_level or "default"
level_keys = [
k for k, v in grades_by_task_and_thinking.items()
if v.get("thinking_level") == thinking_level
]
if not level_keys:
continue
scores = [grades_by_task_and_thinking[k]["mean"] for k in level_keys]
thinking_aggregates[thinking_label] = {
"thinking_level": thinking_label,
"task_count": len(scores),
"mean_score": statistics.mean(scores) if scores else 0.0,
"std_score": statistics.stdev(scores) if len(scores) > 1 else 0.0,
"min_score": min(scores) if scores else 0.0,
"max_score": max(scores) if scores else 0.0,
}

output_dir = Path(args.output_dir)
Expand All @@ -465,16 +563,22 @@ def main():
"timestamp": time.time(),
"suite": args.suite,
"runs_per_task": runs_per_task,
"thinking_levels": [tl or "default" for tl in thinking_levels],
"thinking_aggregates": thinking_aggregates,
"tasks": [
{
"task_id": result["task_id"],
"thinking_level": result.get("thinking_level"),
"status": result["status"],
"timed_out": result["timed_out"],
"execution_time": result["execution_time"],
"transcript_length": len(result["transcript"]),
"usage": result.get("usage", {}),
"workspace": result["workspace"],
"grading": grades_by_task_id[result["task_id"]],
"grading": grades_by_task_and_thinking.get(
f"{result['task_id']}:{result.get('thinking_level') or 'default'}",
grades_by_task_and_thinking.get(result["task_id"], {}),
),
"frontmatter": tasks_by_id[result["task_id"]].frontmatter,
}
for result in results
Expand Down
Loading