Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 62 additions & 127 deletions .github/workflows/skill-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,185 +9,120 @@ permissions:
contents: read

jobs:
# Job 1: Generate matrix from config
prepare:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/test') && github.actor == 'Ariel-Rodriguez'
outputs:
matrix: ${{ steps.generate-matrix.outputs.matrix }}
filter: ${{ steps.parse.outputs.filter }}
use-parallel: ${{ steps.parse.outputs.use-parallel }}

matrix: ${{ steps.matrix.outputs.result }}

steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v5

- name: Parse test command
id: parse
id: command
run: |
COMMENT="${{ github.event.comment.body }}"
FILTER="all"
USE_PARALLEL="false"

# Check for specific provider filter
if echo "$COMMENT" | grep -q "/test copilot"; then
FILTER="copilot"
elif echo "$COMMENT" | grep -q "/test ollama"; then
FILTER="ollama"
elif echo "$COMMENT" | grep -q "/test gemini"; then
FILTER="gemini"
elif echo "$COMMENT" | grep -q "/test all"; then
FILTER="all"
fi

# Check for parallel flag
if echo "$COMMENT" | grep -q "parallel"; then
USE_PARALLEL="true"
SKILLS=$(python3 ci/parse_test_command.py "${{ github.event.comment.body }}")
echo "skills=$SKILLS" >> $GITHUB_OUTPUT
if [ -n "$SKILLS" ]; then
echo "Override skills: $SKILLS"
else
echo "Auto-detecting changed skills..."
fi

echo "filter=$FILTER" >> $GITHUB_OUTPUT
echo "use-parallel=$USE_PARALLEL" >> $GITHUB_OUTPUT
echo "Testing with filter: $FILTER (parallel: $USE_PARALLEL)"

- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install PyYAML
run: pip install pyyaml

- name: Generate matrix
id: generate-matrix
run: |
MATRIX=$(python3 ci/matrix_generator.py --filter-provider "${{ steps.parse.outputs.filter }}")
echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
echo "Generated matrix:"
echo "$MATRIX" | jq .

# Job 2: Run evaluations (parallel or sequential based on parse)
evaluate:
needs: prepare
runs-on: ubuntu-latest
if: needs.prepare.outputs.use-parallel == 'false'

steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: refs/pull/${{ github.event.issue.number }}/head

- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'

- name: Install Copilot CLI
run: npm install -g @github/copilot

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
enable-cache: true

- name: Set up Python
run: uv python install 3.11

- name: Run orchestration
id: matrix
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
PR_NUMBER: ${{ github.event.issue.number }}
run: |
chmod +x ci/*.sh
./ci/orchestrate_evaluations.sh ${{ github.event.issue.number }} --filter-provider "${{ needs.prepare.outputs.filter }}"
python3 ci/generate_matrix.py "$PR_NUMBER" ${{ steps.command.outputs.skills }} > /tmp/matrix.json

- name: Post results
uses: marocchino/sticky-pull-request-comment@v2
if: always() && hashFiles('comment.md') != ''
with:
number: ${{ github.event.issue.number }}
path: comment.md
echo "result<<EOF" >> $GITHUB_OUTPUT
cat /tmp/matrix.json >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT

python3 -c "import json; m=json.load(open('/tmp/matrix.json')); print(f'Generated {len(m[\"include\"])} jobs')"

# Job 3: Run evaluations in parallel (matrix strategy)
evaluate-parallel:
evaluate:
needs: prepare
runs-on: ubuntu-latest
if: needs.prepare.outputs.use-parallel == 'true'
strategy:
fail-fast: false
max-parallel: 2
matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}

steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: refs/pull/${{ github.event.issue.number }}/head
- uses: actions/checkout@v4

- name: Setup Node.js
uses: actions/setup-node@v4
- uses: actions/setup-node@v4
if: matrix.provider == 'copilot'
with:
node-version: '20'
node-version: "20"

- name: Install Copilot CLI
if: matrix.provider == 'copilot'
run: npm install -g @github/copilot

- name: Install uv
uses: astral-sh/setup-uv@v5
- uses: astral-sh/setup-uv@v5
with:
enable-cache: true

- name: Set up Python
run: uv python install 3.11

- name: Detect changes
id: changes
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Set artifact name
id: artifact
run: |
chmod +x ci/detect_changes.sh
MODIFIED_SKILLS=$(./ci/detect_changes.sh ${{ github.event.issue.number }})
echo "modified_skills=$MODIFIED_SKILLS" >> $GITHUB_OUTPUT
ARTIFACT_NAME="results-${{ matrix.provider }}-${{ matrix.model }}-${{ matrix.skill || 'all' }}"
ARTIFACT_NAME="${ARTIFACT_NAME//:/--}"
echo "name=$ARTIFACT_NAME" >> $GITHUB_OUTPUT

- name: Run evaluation for ${{ matrix.display_name }}
- name: Evaluate ${{ matrix.display_name }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
MODIFIED_SKILLS: ${{ steps.changes.outputs.modified_skills }}
run: |
chmod +x ci/run_evaluation.sh
./ci/run_evaluation.sh "${{ matrix.provider }}" "${{ matrix.model }}" 50 "${{ matrix.extra_args }}"
CMD="uv run --project tests --frozen tests/evaluator.py"
CMD="$CMD --provider ${{ matrix.provider }}"
CMD="$CMD --model ${{ matrix.model }}"
CMD="$CMD --judge --verbose --report --threshold 50"

if [ -n "${{ matrix.extra_args }}" ]; then
CMD="$CMD ${{ matrix.extra_args }}"
fi

if [ -n "${{ matrix.skill }}" ]; then
CMD="$CMD --skill ${{ matrix.skill }}"
else
CMD="$CMD --all"
fi

echo "Running: $CMD"
eval "$CMD"

- name: Upload results
uses: actions/upload-artifact@v4
if: always()
uses: actions/upload-artifact@v4
with:
name: results-${{ matrix.provider }}-${{ matrix.model }}
path: tests/results/${{ matrix.provider }}/${{ matrix.model }}/
name: ${{ steps.artifact.outputs.name }}
path: tests/results/
retention-days: 1

# Job 4: Consolidate parallel results
consolidate:
needs: [prepare, evaluate-parallel]
needs: evaluate
runs-on: ubuntu-latest
if: needs.prepare.outputs.use-parallel == 'true' && always()
if: always()

steps:
- name: Checkout code
uses: actions/checkout@v4
- uses: actions/checkout@v4

- name: Download all artifacts
uses: actions/download-artifact@v4
- uses: actions/download-artifact@v4
with:
path: tests/results/

- name: Consolidate results
run: |
chmod +x ci/consolidate_results.sh
./ci/consolidate_results.sh || true
run: python3 ci/consolidate_results.py

- name: Post consolidated results
- name: Post to PR
uses: marocchino/sticky-pull-request-comment@v2
if: hashFiles('comment.md') != ''
with:
Expand Down
8 changes: 0 additions & 8 deletions ci/collect-artifacts.sh

This file was deleted.

4 changes: 2 additions & 2 deletions ci/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
ollama:
enabled: true
local: false
context: 32000
context: 28000
models:
- gpt-oss:20b-cloud
- rnj-1:8b-cloud
- devstral-small-2:24b-cloud
# - gpt-oss:20b-cloud

copilot:
enabled: false
Expand Down
124 changes: 124 additions & 0 deletions ci/consolidate_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""
Consolidate results from parallel evaluations.

Aggregates outputs from multiple provider/model runs and generates summary.
"""

import json
import sys
from pathlib import Path


def main():
    """Aggregate parallel evaluation results into a Markdown PR comment.

    Scans ``tests/results/*/summary.json`` (artifacts download one
    directory per artifact name), builds a comparison table of baseline
    vs. with-skill ratings, and writes it to ``comment.md`` for the
    sticky-comment action to post. Always exits 0 so the consolidate
    job never fails the workflow, even when results are missing.
    """
    results_base = Path("tests/results")

    print("==> Consolidating results from all evaluations")

    # Each evaluation job uploads a summary.json inside its own
    # artifact directory; the download step flattens them under
    # tests/results/<artifact-name>/.
    summary_files = sorted(results_base.glob("*/summary.json"))

    if not summary_files:
        print(f"No results found in {results_base}")
        print(f"Directory contents: {list(results_base.glob('*'))}")
        # Write an empty comment so the posting step still has a file.
        Path("comment.md").write_text("# Evaluation Results\n\nNo results found.\n")
        sys.exit(0)

    failed = 0
    all_results = []

    print("\nResults:")
    print("========")

    for summary_file in summary_files:
        artifact_name = summary_file.parent.name
        try:
            summary = json.loads(summary_file.read_text())
        except (OSError, json.JSONDecodeError):
            # Tolerate one corrupt/unreadable artifact without sinking
            # the rest of the consolidation.
            print(f"❌ {artifact_name} - Could not parse JSON")
            failed += 1
            continue

        all_results.append({
            "artifact": artifact_name,
            "summary": summary,
        })
        print(f"✅ {artifact_name}")

    print("\nSummary:")
    print("========")
    print(f"Processed: {len(all_results)}")
    print(f"Failed: {failed}")

    # Generate comment
    comment = "# 📊 Evaluation Results\n\n"

    if all_results:
        comment += f"Processed {len(all_results)} evaluation(s).\n\n"

        # Build table with results
        comment += "| Test Name | Model | Baseline | With Skill | Cases Pass | Winner |\n"
        comment += "|-----------|-------|----------|------------|------------|--------|\n"

        # Ordered quality scale used to decide whether the skill run
        # matched or beat the baseline rating.
        rating_hierarchy = {'vague': 0, 'regular': 1, 'good': 2, 'outstanding': 3}

        for result in all_results:
            summary = result['summary']
            artifact = result['artifact']

            # Debug: Log top-level keys
            print(f"\n📋 Processing: {artifact}")
            print(f" Top-level keys: {list(summary.keys())}")

            # The evaluator nests the per-run data under results[0];
            # fall back to an empty dict when the list is absent/empty.
            eval_result = (summary.get('results') or [{}])[0]
            print(f" Nested result keys: {list(eval_result.keys())}")

            model = eval_result.get('model', 'N/A')
            baseline_rating = eval_result.get('baseline_rating', 'N/A')
            skill_rating = eval_result.get('skill_rating', 'N/A')
            skill_pass = eval_result.get('skill_pass_count', 'N/A')
            overall_better = eval_result.get('judgment', {}).get('overall_better', 'N/A')

            print(f" Extracted: model={model}, baseline={baseline_rating}, skill={skill_rating}, winner={overall_better}")

            # Judge verdict: 'A' = baseline won, 'B' = skill run won.
            if overall_better == 'A':
                winner = "Baseline"
            elif overall_better == 'B':
                winner = "With Skill"
            elif overall_better == 'TIE':
                winner = "Tie"
            else:
                winner = "N/A"

            # Pass when the skill run rated at least as high as the
            # baseline; unknown ratings score -1 and compare normally.
            baseline_score = rating_hierarchy.get(baseline_rating, -1)
            skill_score = rating_hierarchy.get(skill_rating, -1)
            pass_emoji = "✅" if skill_score >= baseline_score else "❌"

            # NOTE(review): empty link target renders a dead link —
            # presumably a placeholder for the job/run URL; confirm.
            test_link = f"[{artifact}]()"
            comment += f"| {test_link} | {model} | {baseline_rating} | {skill_rating} | {pass_emoji} {skill_pass} | {winner} |\n"

        comment += "\n"
    else:
        comment += "No evaluation results found.\n"

    Path("comment.md").write_text(comment)
    print("\n✓ Comment saved to comment.md")

    sys.exit(0)


if __name__ == "__main__":
    # Removed unused `import os` — nothing in this module references os.
    main()
Loading
Loading