Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions .github/workflows/benchmark-dashboard.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Manually-triggered benchmark run: evaluates a provider/model pair and
# republishes the dashboard data via the benchmark-history branch.
name: Benchmark Dashboard

on:
  workflow_dispatch:
    inputs:
      provider:
        description: "Provider (ollama, copilot, gemini)"
        required: true
        type: choice
        options:
          - ollama
          - copilot
          - gemini
      model:
        description: "Model name (e.g., qwen-coder-next:cloud)"
        required: true
        type: string
      skill:
        description: "Optional specific skill to test"
        required: false
        type: string

jobs:
  run-benchmark:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          ref: main
          path: workspace

      - uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true

      - name: Run benchmark
        working-directory: workspace
        env:
          OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
          # Pass user-controlled inputs through environment variables
          # instead of interpolating ${{ }} into the script body —
          # prevents shell injection via crafted input values.
          PROVIDER: ${{ inputs.provider }}
          MODEL: ${{ inputs.model }}
        run: |
          # Install dependencies
          uv sync --project tests
          # Run benchmark
          uv run --project tests tests/evaluator.py \
            --provider "$PROVIDER" \
            --model "$MODEL" \
            --judge \
            --verbose \
            --report
          # Rename results for clarity. Sanitize the model name first:
          # it may contain ':' or '/' (e.g. "qwen-coder-next:cloud"),
          # and '/' would break the mv target path.
          if [ -d tests/results ]; then
            SAFE_MODEL=$(echo "$MODEL" | tr ':/' '--')
            ARTIFACT_NAME="benchmark-${PROVIDER}-${SAFE_MODEL}-$(date +%Y%m%d-%H%M%S)"
            mv tests/results "tests/${ARTIFACT_NAME}"
          fi

      - name: Generate dashboard
        working-directory: workspace
        env:
          # Same injection-hardening as above.
          PROVIDER: ${{ inputs.provider }}
          MODEL: ${{ inputs.model }}
        run: |
          uv run --project tests python3 ci/publish_benchmarks.py \
            --provider "$PROVIDER" \
            --model "$MODEL" \
            --branch benchmark-history

  deploy-pages:
    needs: run-benchmark
    runs-on: ubuntu-latest

    steps:
      - name: Checkout workspace
        uses: actions/checkout@v4
        with:
          ref: main
          path: workspace

      - name: Checkout benchmark data
        uses: actions/checkout@v4
        with:
          ref: benchmark-history
          path: benchmark-data

      - name: Verify deployment
        run: |
          echo "Benchmark data directory contents:"
          ls -la benchmark-data/docs/
          echo ""
          echo "GitHub Pages will be served from benchmark-history branch"
          echo "Dashboard will be available at:"
          # github.repository is "owner/repo" — not a URL host. Build the
          # canonical Pages URL <owner>.github.io/<repo>/ instead.
          echo "https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }}/"
4 changes: 2 additions & 2 deletions .github/workflows/skill-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ jobs:
fi

echo "Running evaluation..."
# Use trusted script
uv run --project trusted-scripts/tests --frozen trusted-scripts/tests/evaluator.py "${ARGS[@]}"
# Use trusted script with correct project path
uv run --project . --frozen trusted-scripts/tests/evaluator.py "${ARGS[@]}"

- name: Upload results
if: always()
Expand Down
190 changes: 122 additions & 68 deletions ci/consolidate_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,93 +3,52 @@
Consolidate results from parallel evaluations.

Aggregates outputs from multiple provider/model runs and generates summary.
Supports both PR comments and benchmark dashboard generation.
"""

import argparse
import json
import sys
from pathlib import Path


def main():
results_base = Path("tests/results")

print("==> Consolidating results from all evaluations")

# Find all summary.json files (artifacts download to flat structure)
summary_files = sorted(results_base.glob("*/summary.json"))

if not summary_files:
print(f"No results found in {results_base}")
print(f"Directory contents: {list(results_base.glob('*'))}")
# Write empty comment so job doesn't fail
Path("comment.md").write_text("# Evaluation Results\n\nNo results found.\n")
sys.exit(0)

# Check for failures
failed = 0
succeeded = 0
all_results = []
def generate_pr_comment(results: list) -> str:
"""
Generate GitHub PR comment from results.

print("\nResults:")
print("========")
Args:
results: List of result dictionaries

for summary_file in summary_files:
try:
summary = json.loads(summary_file.read_text())
artifact_name = summary_file.parent.name

all_results.append({
"artifact": artifact_name,
"summary": summary,
})

print(f"✅ {artifact_name}")
succeeded += 1

except json.JSONDecodeError:
print(f"❌ {summary_file.parent.name} - Could not parse JSON")
failed += 1
Returns:
Markdown comment string
"""
comment = "# 📊 Evaluation Results\n\n"

print("\nSummary:")
print("========")
print(f"Processed: {len(all_results)}")
print(f"Failed: {failed}")
if results:
comment += f"Processed {len(results)} evaluation(s).\n\n"

# Generate comment
comment = "# 📊 Evaluation Results\n\n"

if all_results:
comment += f"Processed {len(all_results)} evaluation(s).\n\n"

# Build table with results
comment += "| Test Name | Model | Baseline | With Skill | Cases Pass | Winner |\n"
comment += "|-----------|-------|----------|------------|------------|--------|\n"

# Rating hierarchy for comparison
rating_hierarchy = {'vague': 0, 'regular': 1, 'good': 2, 'outstanding': 3}
for result in all_results:

for result in results:
summary = result['summary']
artifact = result['artifact']

# Debug: Log top-level keys
print(f"\n📋 Processing: {artifact}")
print(f" Top-level keys: {list(summary.keys())}")


# Extract key data from nested results[0]
eval_result = summary.get('results', [{}])[0] if summary.get('results') else {}
print(f" Nested result keys: {list(eval_result.keys())}")


skill = eval_result.get('skill', 'N/A')
model = eval_result.get('model', 'N/A')
baseline_rating = eval_result.get('baseline_rating', 'N/A')
skill_rating = eval_result.get('skill_rating', 'N/A')
baseline_pass = eval_result.get('baseline_pass_count', 'N/A')
skill_pass = eval_result.get('skill_pass_count', 'N/A')
overall_better = eval_result.get('judgment', {}).get('overall_better', 'N/A')

print(f" Extracted: model={model}, baseline={baseline_rating}, skill={skill_rating}, winner={overall_better}")


# Determine winner
if overall_better == 'A':
winner = "Baseline"
Expand All @@ -99,26 +58,121 @@ def main():
winner = "Tie"
else:
winner = "N/A"
# Determine emoji for cases pass (skill >= baseline in rating hierarchy)

# Determine emoji for cases pass
baseline_score = rating_hierarchy.get(baseline_rating, -1)
skill_score = rating_hierarchy.get(skill_rating, -1)
pass_emoji = "✅" if skill_score >= baseline_score else "❌"

# Build row
test_link = f"[{artifact}]()"
comment += f"| {test_link} | {model} | {baseline_rating} | {skill_rating} | {pass_emoji} {skill_pass} | {winner} |\n"

comment += "\n"
else:
comment += "No evaluation results found.\n"

Path("comment.md").write_text(comment)
print("\n✓ Comment saved to comment.md")

return comment


def generate_benchmark_data(results: list, output_dir: Path) -> None:
    """
    Generate benchmark data files for the dashboard.

    Writes one ``<artifact>.json`` per result, plus a single
    ``benchmark_aggregated.json`` combining all results.

    Args:
        results: List of result dictionaries, each with 'artifact' and
            'summary' keys.
        output_dir: Directory to write benchmark data (created if missing).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save individual benchmark data, one file per artifact.
    for result in results:
        benchmark_file = output_dir / f"{result['artifact']}.json"
        benchmark_file.write_text(json.dumps(result['summary'], indent=2))

    # Save aggregated data for dashboard.
    # BUG FIX: the previous placeholder was json.dumps(None), which embeds
    # the six-character *string* "null" (double-encoded) in the output.
    # Plain None serializes to a proper JSON null.
    aggregated = {
        "generated_at": None,  # Will be set by generate_dashboard_data.py
        "results": [
            {
                "artifact": r['artifact'],
                "summary": r['summary'],
            }
            for r in results
        ],
    }

    aggregated_file = output_dir / "benchmark_aggregated.json"
    aggregated_file.write_text(json.dumps(aggregated, indent=2))


def main():
    """
    CLI entry point: consolidate per-run ``summary.json`` files and emit
    either a PR comment (markdown) or benchmark data files.

    Exits 0 in all handled cases, including "no results found".
    """
    parser = argparse.ArgumentParser(description="Consolidate evaluation results")
    parser.add_argument("--mode", choices=["pr-comment", "benchmark"], default="pr-comment",
                        help="Output mode: pr-comment or benchmark")
    parser.add_argument("--results-dir", type=Path, default="tests/results",
                        help="Directory containing evaluation results")
    parser.add_argument("--output-dir", type=Path, default=None,
                        help="Output directory for benchmark mode")
    parser.add_argument("--output-file", type=Path, default="comment.md",
                        help="Output file for PR comment mode")
    args = parser.parse_args()

    # Benchmark output defaults to a subdirectory of the results dir.
    if args.mode == "benchmark" and not args.output_dir:
        args.output_dir = args.results_dir / "benchmark"

    print(f"==> Consolidating results (mode: {args.mode})")

    # Each evaluation run leaves a summary.json in its own subdirectory.
    summary_files = sorted(args.results_dir.glob("*/summary.json"))

    if not summary_files:
        print(f"No results found in {args.results_dir}")
        print(f"Directory contents: {list(args.results_dir.glob('*'))}")

        if args.mode == "pr-comment":
            # Still write a comment file so the downstream job doesn't fail.
            args.output_file.write_text("# Evaluation Results\n\nNo results found.\n")
        sys.exit(0)

    # Parse every summary; tolerate (and count) malformed or unreadable
    # files instead of aborting the whole consolidation.
    all_results = []
    failed = 0

    for summary_file in summary_files:
        artifact_name = summary_file.parent.name
        try:
            summary = json.loads(summary_file.read_text())
        except (json.JSONDecodeError, OSError) as e:
            print(f"❌ {artifact_name} - Could not parse JSON: {e}")
            failed += 1
            continue

        all_results.append({
            "artifact": artifact_name,
            "summary": summary,
        })
        print(f"✅ {artifact_name}")

    print("\nSummary:")
    print(f"  Processed: {len(all_results)}")
    print(f"  Failed: {failed}")

    # Generate output based on mode
    if args.mode == "pr-comment":
        comment = generate_pr_comment(all_results)
        # args.output_file is already a Path (argparse applies type=Path
        # to the string default too) — no re-wrapping needed.
        args.output_file.write_text(comment)
        print(f"\n✓ Comment saved to {args.output_file}")

    elif args.mode == "benchmark":
        generate_benchmark_data(all_results, args.output_dir)
        print(f"\n✓ Benchmark data saved to {args.output_dir}")

    sys.exit(0)


if __name__ == "__main__":
    # Removed an unused `import os` that had been left inside the guard.
    main()
Loading