# feat: add SWE-bench and TAU-bench benchmark suite (#3)
# Workflow file for this run

name: Benchmark Full Suite

on:
  # Manual trigger with benchmark/provider/model selection.
  workflow_dispatch:
    inputs:
      benchmark:
        description: "Which benchmark to run"
        type: choice
        required: true
        default: both
        options:
          - all
          - both
          - swe
          - tau
          - tb2
      provider:
        description: "SWE/TAU provider filter"
        type: choice
        required: true
        default: all
        options:
          - all
          - anthropic
          - openai
          - gemini
      tb2_model:
        description: "TB2 model in provider/model format"
        type: string
        required: true
        default: openai/glm-5
  push:
    branches:
      - add_benchmark_test
  pull_request:
    branches:
      - main

env:
  NODE_VERSION: "20"  # quoted: keeps setup-node from reading 20 as a number

# Least-privilege token: the job only reads the repo.
permissions:
  contents: read
jobs:
  benchmark:
    name: Benchmark
    runs-on: ubuntu-latest
    timeout-minutes: 360
    # Opt-in switch so forks/branches don't burn API credits by default.
    if: ${{ vars.BENCHMARK_ACTION_ENABLED == '1' }}
    env:
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: npm

      - name: Setup uv
        uses: astral-sh/setup-uv@v4

      # Docker Hub login is optional: skipped when the secrets are unset,
      # so the workflow still runs (with anonymous pull limits) on forks.
      - name: Login to Docker Hub (optional)
        if: ${{ env.DOCKERHUB_USERNAME != '' && env.DOCKERHUB_TOKEN != '' }}
        uses: docker/login-action@v3
        with:
          username: ${{ env.DOCKERHUB_USERNAME }}
          password: ${{ env.DOCKERHUB_TOKEN }}

      - name: Install dependencies
        run: npm ci

      # The quoted 'EOT' delimiter stops the *shell* from expanding anything;
      # the ${{ }} expressions are substituted by the Actions runner before
      # the script executes, so the secrets still land in .env.test.
      - name: Create benchmark environment
        run: |
          cat > .env.test << 'EOT'
          ANTHROPIC_API_KEY=${{ secrets.ANTHROPIC_API_KEY }}
          ANTHROPIC_MODEL_ID=${{ vars.ANTHROPIC_MODEL_ID }}
          ANTHROPIC_BASE_URL=${{ vars.ANTHROPIC_BASE_URL }}
          OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}
          OPENAI_MODEL_ID=${{ vars.OPENAI_MODEL_ID }}
          OPENAI_BASE_URL=${{ vars.OPENAI_BASE_URL }}
          GEMINI_API_KEY=${{ secrets.GEMINI_API_KEY }}
          GEMINI_MODEL_ID=${{ vars.GEMINI_MODEL_ID }}
          GEMINI_BASE_URL=${{ vars.GEMINI_BASE_URL }}
          BENCHMARK_DOCKER_PROXY=${{ vars.BENCHMARK_DOCKER_PROXY }}
          BENCHMARK_TIMEOUT_MS=${{ vars.BENCHMARK_TIMEOUT_MS }}
          EOT

      # Inputs are passed through env instead of being interpolated with
      # ${{ }} inside the script: tb2_model is free-form text, and direct
      # interpolation into `run:` is a shell-injection vector.
      - name: Run unified benchmark command
        env:
          INPUT_BENCHMARK: ${{ github.event.inputs.benchmark || 'both' }}
          INPUT_PROVIDER: ${{ github.event.inputs.provider || 'all' }}
          INPUT_TB2_MODEL: ${{ github.event.inputs.tb2_model || 'openai/glm-5' }}
        run: |
          mkdir -p tests/tmp
          benchmark="${INPUT_BENCHMARK}"
          provider="${INPUT_PROVIDER}"
          tb2_model="${INPUT_TB2_MODEL}"
          args=(
            "--benchmark=${benchmark}"
            "--tb2-model=${tb2_model}"
            --tb2-agent=oracle
            --tb2-runner=uvx
            --tb2-python=3.12
            --tb2-jobs-dir=./tests/tmp/jobs
            --output=json
            --output-file=tests/tmp/benchmark-report.json
          )
          # Provider filter only applies to SWE/TAU; TB2 selects its model directly.
          if [[ "${provider}" != "all" && "${benchmark}" != "tb2" ]]; then
            args+=("--provider=${provider}")
          fi
          npm run test:benchmark -- "${args[@]}"

      # Render a markdown summary from the JSON report; runs even on failure
      # so partial results are still visible in the run summary.
      - name: Write step summary
        if: ${{ always() }}
        run: |
          node - <<'NODE' >> "$GITHUB_STEP_SUMMARY"
          const fs = require('fs');
          function readJson(p) {
            if (!fs.existsSync(p)) return null;
            try { return JSON.parse(fs.readFileSync(p, 'utf8')); } catch { return null; }
          }
          const report = readJson('tests/tmp/benchmark-report.json');
          console.log('## Benchmark Report');
          console.log('');
          if (!report) {
            console.log('- report not found');
            process.exit(0);
          }
          if (Array.isArray(report.swe) && report.swe.length > 0) {
            console.log('### SWE-bench-Verified');
            console.log('');
            console.log('| Provider / Model | Resolved | Rate |');
            console.log('|---|---:|---:|');
            for (const r of report.swe) {
              const name = `${r.provider.id} / ${r.provider.model}`;
              const resolved = `${r.summary.resolved}/${r.summary.total}`;
              const rate = `${(r.summary.rate * 100).toFixed(1)}%`;
              console.log(`| ${name} | ${resolved} | ${rate} |`);
            }
            console.log('');
          }
          if (Array.isArray(report.tau) && report.tau.length > 0) {
            console.log('### TAU-bench');
            console.log('');
            console.log('| Provider / Model | Domain | Pass^1 | Avg Tokens |');
            console.log('|---|---|---:|---:|');
            for (const r of report.tau) {
              const name = `${r.provider.id} / ${r.provider.model}`;
              const domain = r.summary.domain;
              const pass1 = `${((r.summary.pass_at_k?.[0] ?? 0) * 100).toFixed(1)}%`;
              const observed = (r.summary.token_observed_trials ?? 0) > 0;
              const avgTokens = observed
                ? (r.summary.avg_tokens >= 1000 ? `${(r.summary.avg_tokens / 1000).toFixed(1)}k` : `${r.summary.avg_tokens}`)
                : '-';
              console.log(`| ${name} | ${domain} | ${pass1} | ${avgTokens} |`);
            }
            console.log('');
          }
          if (report.tb2) {
            const tb2 = report.tb2;
            console.log('### Terminal Bench 2.0');
            console.log('');
            console.log(`- Agent: \`${tb2.agent}\``);
            if (tb2.model) console.log(`- Model: \`${tb2.model}\``);
            console.log(`- Passed: **${tb2.passed}/${tb2.total}**`);
            console.log(`- Rate: **${(tb2.rate * 100).toFixed(1)}%**`);
            if (typeof tb2.avg_total_tokens === 'number' && (tb2.token_observed_trials ?? 0) > 0) {
              console.log(`- Avg tokens: **${tb2.avg_total_tokens}** (observed ${tb2.token_observed_trials} trials)`);
            } else {
              console.log(`- Avg tokens: **N/A**`);
            }
            console.log('');
          }
          NODE

      - name: Upload benchmark artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-artifacts-${{ github.run_id }}
          if-no-files-found: warn
          path: |
            tests/tmp/benchmark-report.json
            tests/tmp/jobs/*/result.json
            tests/tmp/tau2-data/simulations/*.json