Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 89 additions & 0 deletions .github/workflows/benchmark-dashboard.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Manually-triggered benchmark run: evaluates a provider/model pair and
# republishes the dashboard data via the benchmark-history branch.
name: Benchmark Dashboard

on:
  workflow_dispatch:
    inputs:
      provider:
        description: "Provider (ollama, copilot, gemini)"
        required: true
        type: choice
        options:
          - ollama
          - copilot
          - gemini
      model:
        description: "Model name (e.g., qwen-coder-next:cloud)"
        required: true
        type: string
      skill:
        description: "Optional specific skill to test"
        required: false
        type: string

jobs:
  run-benchmark:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          ref: main
          path: workspace

      - uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true

      - name: Run benchmark
        working-directory: workspace
        env:
          OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
          # Pass user-controlled inputs through environment variables
          # instead of interpolating ${{ }} into the script body —
          # prevents shell injection via crafted input values.
          PROVIDER: ${{ inputs.provider }}
          MODEL: ${{ inputs.model }}
        run: |
          # Install dependencies
          uv sync --project tests
          # Run benchmark
          uv run --project tests tests/evaluator.py \
            --provider "$PROVIDER" \
            --model "$MODEL" \
            --judge \
            --verbose \
            --report
          # Rename results for clarity. Sanitize the model name first:
          # it may contain ':' or '/' (e.g. "qwen-coder-next:cloud"),
          # and '/' would break the mv target path.
          if [ -d tests/results ]; then
            SAFE_MODEL=$(echo "$MODEL" | tr ':/' '--')
            ARTIFACT_NAME="benchmark-${PROVIDER}-${SAFE_MODEL}-$(date +%Y%m%d-%H%M%S)"
            mv tests/results "tests/${ARTIFACT_NAME}"
          fi

      - name: Generate dashboard
        working-directory: workspace
        env:
          # Same injection-hardening as above.
          PROVIDER: ${{ inputs.provider }}
          MODEL: ${{ inputs.model }}
        run: |
          uv run --project tests python3 ci/publish_benchmarks.py \
            --provider "$PROVIDER" \
            --model "$MODEL" \
            --branch benchmark-history

  deploy-pages:
    needs: run-benchmark
    runs-on: ubuntu-latest

    steps:
      - name: Checkout workspace
        uses: actions/checkout@v4
        with:
          ref: main
          path: workspace

      - name: Checkout benchmark data
        uses: actions/checkout@v4
        with:
          ref: benchmark-history
          path: benchmark-data

      - name: Verify deployment
        run: |
          echo "Benchmark data directory contents:"
          ls -la benchmark-data/docs/
          echo ""
          echo "GitHub Pages will be served from benchmark-history branch"
          echo "Dashboard will be available at:"
          # github.repository is "owner/repo" — not a URL host. Build the
          # canonical Pages URL <owner>.github.io/<repo>/ instead.
          echo "https://${{ github.repository_owner }}.github.io/${{ github.event.repository.name }}/"
4 changes: 2 additions & 2 deletions .github/workflows/skill-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ jobs:
fi

echo "Running evaluation..."
# Use trusted script
uv run --project trusted-scripts/tests --frozen trusted-scripts/tests/evaluator.py "${ARGS[@]}"
# Use trusted script with correct project path
uv run --project . --frozen trusted-scripts/tests/evaluator.py "${ARGS[@]}"

- name: Upload results
if: always()
Expand Down
190 changes: 122 additions & 68 deletions ci/consolidate_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,93 +3,52 @@
Consolidate results from parallel evaluations.

Aggregates outputs from multiple provider/model runs and generates summary.
Supports both PR comments and benchmark dashboard generation.
"""

import argparse
import json
import sys
from pathlib import Path


def main():
results_base = Path("tests/results")

print("==> Consolidating results from all evaluations")

# Find all summary.json files (artifacts download to flat structure)
summary_files = sorted(results_base.glob("*/summary.json"))

if not summary_files:
print(f"No results found in {results_base}")
print(f"Directory contents: {list(results_base.glob('*'))}")
# Write empty comment so job doesn't fail
Path("comment.md").write_text("# Evaluation Results\n\nNo results found.\n")
sys.exit(0)

# Check for failures
failed = 0
succeeded = 0
all_results = []
def generate_pr_comment(results: list) -> str:
"""
Generate GitHub PR comment from results.

print("\nResults:")
print("========")
Args:
results: List of result dictionaries

for summary_file in summary_files:
try:
summary = json.loads(summary_file.read_text())
artifact_name = summary_file.parent.name

all_results.append({
"artifact": artifact_name,
"summary": summary,
})

print(f"✅ {artifact_name}")
succeeded += 1

except json.JSONDecodeError:
print(f"❌ {summary_file.parent.name} - Could not parse JSON")
failed += 1
Returns:
Markdown comment string
"""
comment = "# 📊 Evaluation Results\n\n"

print("\nSummary:")
print("========")
print(f"Processed: {len(all_results)}")
print(f"Failed: {failed}")
if results:
comment += f"Processed {len(results)} evaluation(s).\n\n"

# Generate comment
comment = "# 📊 Evaluation Results\n\n"

if all_results:
comment += f"Processed {len(all_results)} evaluation(s).\n\n"

# Build table with results
comment += "| Test Name | Model | Baseline | With Skill | Cases Pass | Winner |\n"
comment += "|-----------|-------|----------|------------|------------|--------|\n"

# Rating hierarchy for comparison
rating_hierarchy = {'vague': 0, 'regular': 1, 'good': 2, 'outstanding': 3}
for result in all_results:

for result in results:
summary = result['summary']
artifact = result['artifact']

# Debug: Log top-level keys
print(f"\n📋 Processing: {artifact}")
print(f" Top-level keys: {list(summary.keys())}")


# Extract key data from nested results[0]
eval_result = summary.get('results', [{}])[0] if summary.get('results') else {}
print(f" Nested result keys: {list(eval_result.keys())}")


skill = eval_result.get('skill', 'N/A')
model = eval_result.get('model', 'N/A')
baseline_rating = eval_result.get('baseline_rating', 'N/A')
skill_rating = eval_result.get('skill_rating', 'N/A')
baseline_pass = eval_result.get('baseline_pass_count', 'N/A')
skill_pass = eval_result.get('skill_pass_count', 'N/A')
overall_better = eval_result.get('judgment', {}).get('overall_better', 'N/A')

print(f" Extracted: model={model}, baseline={baseline_rating}, skill={skill_rating}, winner={overall_better}")


# Determine winner
if overall_better == 'A':
winner = "Baseline"
Expand All @@ -99,26 +58,121 @@ def main():
winner = "Tie"
else:
winner = "N/A"
# Determine emoji for cases pass (skill >= baseline in rating hierarchy)

# Determine emoji for cases pass
baseline_score = rating_hierarchy.get(baseline_rating, -1)
skill_score = rating_hierarchy.get(skill_rating, -1)
pass_emoji = "✅" if skill_score >= baseline_score else "❌"

# Build row
test_link = f"[{artifact}]()"
comment += f"| {test_link} | {model} | {baseline_rating} | {skill_rating} | {pass_emoji} {skill_pass} | {winner} |\n"

comment += "\n"
else:
comment += "No evaluation results found.\n"

Path("comment.md").write_text(comment)
print("\n✓ Comment saved to comment.md")

return comment


def generate_benchmark_data(results: list, output_dir: Path) -> None:
    """
    Generate benchmark data files for the dashboard.

    Writes one ``<artifact>.json`` per result, plus a single
    ``benchmark_aggregated.json`` combining all results.

    Args:
        results: List of result dictionaries, each with 'artifact' and
            'summary' keys.
        output_dir: Directory to write benchmark data (created if missing).
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save individual benchmark data, one file per artifact.
    for result in results:
        benchmark_file = output_dir / f"{result['artifact']}.json"
        benchmark_file.write_text(json.dumps(result['summary'], indent=2))

    # Save aggregated data for dashboard.
    # BUG FIX: the previous placeholder was json.dumps(None), which embeds
    # the six-character *string* "null" (double-encoded) in the output.
    # Plain None serializes to a proper JSON null.
    aggregated = {
        "generated_at": None,  # Will be set by generate_dashboard_data.py
        "results": [
            {
                "artifact": r['artifact'],
                "summary": r['summary'],
            }
            for r in results
        ],
    }

    aggregated_file = output_dir / "benchmark_aggregated.json"
    aggregated_file.write_text(json.dumps(aggregated, indent=2))


def main():
    """
    CLI entry point: consolidate per-run ``summary.json`` files and emit
    either a PR comment (markdown) or benchmark data files.

    Exits 0 in all handled cases, including "no results found".
    """
    parser = argparse.ArgumentParser(description="Consolidate evaluation results")
    parser.add_argument("--mode", choices=["pr-comment", "benchmark"], default="pr-comment",
                        help="Output mode: pr-comment or benchmark")
    parser.add_argument("--results-dir", type=Path, default="tests/results",
                        help="Directory containing evaluation results")
    parser.add_argument("--output-dir", type=Path, default=None,
                        help="Output directory for benchmark mode")
    parser.add_argument("--output-file", type=Path, default="comment.md",
                        help="Output file for PR comment mode")
    args = parser.parse_args()

    # Benchmark output defaults to a subdirectory of the results dir.
    if args.mode == "benchmark" and not args.output_dir:
        args.output_dir = args.results_dir / "benchmark"

    print(f"==> Consolidating results (mode: {args.mode})")

    # Each evaluation run leaves a summary.json in its own subdirectory.
    summary_files = sorted(args.results_dir.glob("*/summary.json"))

    if not summary_files:
        print(f"No results found in {args.results_dir}")
        print(f"Directory contents: {list(args.results_dir.glob('*'))}")

        if args.mode == "pr-comment":
            # Still write a comment file so the downstream job doesn't fail.
            args.output_file.write_text("# Evaluation Results\n\nNo results found.\n")
        sys.exit(0)

    # Parse every summary; tolerate (and count) malformed or unreadable
    # files instead of aborting the whole consolidation.
    all_results = []
    failed = 0

    for summary_file in summary_files:
        artifact_name = summary_file.parent.name
        try:
            summary = json.loads(summary_file.read_text())
        except (json.JSONDecodeError, OSError) as e:
            print(f"❌ {artifact_name} - Could not parse JSON: {e}")
            failed += 1
            continue

        all_results.append({
            "artifact": artifact_name,
            "summary": summary,
        })
        print(f"✅ {artifact_name}")

    print("\nSummary:")
    print(f"  Processed: {len(all_results)}")
    print(f"  Failed: {failed}")

    # Generate output based on mode
    if args.mode == "pr-comment":
        comment = generate_pr_comment(all_results)
        # args.output_file is already a Path (argparse applies type=Path
        # to the string default too) — no re-wrapping needed.
        args.output_file.write_text(comment)
        print(f"\n✓ Comment saved to {args.output_file}")

    elif args.mode == "benchmark":
        generate_benchmark_data(all_results, args.output_dir)
        print(f"\n✓ Benchmark data saved to {args.output_dir}")

    sys.exit(0)


if __name__ == "__main__":
    # Removed an unused `import os` that had been left inside the guard.
    main()
Loading