# Skip to content

# chore: Benchmarks Page - stage 1/2 #47

# chore: Benchmarks Page - stage 1/2

# chore: Benchmarks Page - stage 1/2 #47

# Display name of the workflow in the Actions UI.
name: Skill Validation

# Trigger: a newly created comment. The issue_comment event covers comments
# on both issues and pull requests; the jobs decide which comments to act on.
on:
  issue_comment:
    types:
      - created

# Token scopes granted to every job in this workflow.
permissions:
  # Write access so a job can post a comment back to the pull request.
  pull-requests: write
  # Read-only access to repository contents (sufficient for checkouts).
  contents: read
jobs:
  # ---------------------------------------------------------------------------
  # prepare: parses the "/test" comment and builds the evaluation matrix.
  # Output `matrix` is a JSON document consumed by the evaluate job.
  # ---------------------------------------------------------------------------
  prepare:
    runs-on: ubuntu-latest
    # Run only for comments on pull requests that contain "/test" and were
    # written by the maintainer account.
    if: github.event.issue.pull_request && contains(github.event.comment.body, '/test') && github.actor == 'Ariel-Rodriguez'
    outputs:
      matrix: ${{ steps.matrix.outputs.result }}
    steps:
      # Checkout trusted scripts from main
      - uses: actions/checkout@v4
        with:
          ref: main
          path: trusted-scripts
      # Checkout PR code (for context, though prepare mainly uses scripts).
      # On issue_comment events the default ref is the default branch, so the
      # pull request head must be requested explicitly.
      - uses: actions/checkout@v4
        with:
          ref: refs/pull/${{ github.event.issue.number }}/head
          path: workspace
      - uses: astral-sh/setup-uv@v5
      - name: Parse test command
        id: command
        env:
          # The comment body is user-controlled text. It is handed to the
          # script through the environment instead of being interpolated into
          # the shell source, which would allow command injection.
          COMMENT_BODY: ${{ github.event.comment.body }}
        run: |
          SKILLS=$(python3 trusted-scripts/ci/parse_test_command.py "$COMMENT_BODY")
          echo "skills=$SKILLS" >> "$GITHUB_OUTPUT"
          if [ -n "$SKILLS" ]; then
            echo "Override skills: $SKILLS"
          else
            echo "Auto-detecting changed skills..."
          fi
      - name: Generate matrix
        id: matrix
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.issue.number }}
          # Derived from the comment body, so it is also passed via env.
          SKILLS: ${{ steps.command.outputs.skills }}
        run: |
          # Run from workspace root but use trusted script
          cd workspace
          # Split the skill list on whitespace into separate arguments without
          # letting the shell evaluate or glob-expand its contents.
          read -r -a SKILL_ARGS <<< "$SKILLS"
          python3 ../trusted-scripts/ci/generate_matrix.py "$PR_NUMBER" "${SKILL_ARGS[@]}" > /tmp/matrix.json
          # Multi-line output: heredoc-style delimiter syntax for GITHUB_OUTPUT.
          echo "result<<EOF" >> "$GITHUB_OUTPUT"
          cat /tmp/matrix.json >> "$GITHUB_OUTPUT"
          echo "EOF" >> "$GITHUB_OUTPUT"
          python3 -c "import json; m=json.load(open('/tmp/matrix.json')); print(f'Generated {len(m[\"include\"])} jobs')"

  # ---------------------------------------------------------------------------
  # evaluate: one job per matrix entry; runs the evaluator from the trusted
  # checkout against the pull request code and uploads tests/results/.
  # ---------------------------------------------------------------------------
  evaluate:
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      max-parallel: 2
      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
    steps:
      # Checkout actual PR code to test. This must happen first: a checkout
      # into the workspace root clears whatever is already there, which would
      # remove a previously created trusted-scripts directory.
      - uses: actions/checkout@v4
        with:
          ref: refs/pull/${{ github.event.issue.number }}/head
      # Checkout trusted scripts again (new job), nested inside the workspace.
      - uses: actions/checkout@v4
        with:
          ref: main
          path: trusted-scripts
      - uses: actions/setup-node@v4
        if: matrix.provider == 'copilot'
        with:
          node-version: "20"
      - name: Install Copilot CLI
        if: matrix.provider == 'copilot'
        run: npm install -g @github/copilot
      - uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true
      - name: Set artifact name
        id: artifact
        env:
          # Matrix values reach the shell through the environment rather than
          # through expression interpolation inside the script text.
          MATRIX_PROVIDER: ${{ matrix.provider }}
          MATRIX_MODEL: ${{ matrix.model }}
          MATRIX_SKILL: ${{ matrix.skill || 'all' }}
        run: |
          ARTIFACT_NAME="results-${MATRIX_PROVIDER}-${MATRIX_MODEL}-${MATRIX_SKILL}"
          # Replace every ':' with '--' in the artifact name.
          ARTIFACT_NAME="${ARTIFACT_NAME//:/--}"
          echo "name=$ARTIFACT_NAME" >> "$GITHUB_OUTPUT"
      - name: Evaluate ${{ matrix.display_name }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
          TRULENS_OTEL_ENDPOINT: ${{ secrets.TRULENS_OTEL_ENDPOINT }}
          MATRIX_PROVIDER: ${{ matrix.provider }}
          MATRIX_MODEL: ${{ matrix.model }}
          MATRIX_SKILL: ${{ matrix.skill }}
          MATRIX_EXTRA_ARGS: ${{ matrix.extra_args }}
        run: |
          # Construct arguments array for safety
          ARGS=(--provider "$MATRIX_PROVIDER" --model "$MATRIX_MODEL" --judge --verbose --report --threshold 50)
          if [ -n "$MATRIX_EXTRA_ARGS" ]; then
            # extra_args is treated as a whitespace-separated list of simple
            # flags; read -a splits it without globbing or shell evaluation.
            read -r -a EXTRA <<< "$MATRIX_EXTRA_ARGS"
            ARGS+=("${EXTRA[@]}")
          fi
          if [ -n "$MATRIX_SKILL" ]; then
            ARGS+=(--skill "$MATRIX_SKILL")
          else
            ARGS+=(--all)
          fi
          echo "Running evaluation..."
          # Use trusted script
          uv run --project trusted-scripts/tests --frozen trusted-scripts/tests/evaluator.py "${ARGS[@]}"
      - name: Upload results
        # Upload whatever exists even when the evaluation step failed.
        # NOTE(review): presumably evaluator.py writes into tests/results/
        # relative to the workspace root - confirm.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact.outputs.name }}
          path: tests/results/
          retention-days: 1

  # ---------------------------------------------------------------------------
  # consolidate: downloads every result artifact, merges them with the trusted
  # script, and posts comment.md to the pull request when that file exists.
  # ---------------------------------------------------------------------------
  consolidate:
    # prepare is listed so its result can be inspected in the condition below.
    needs: [prepare, evaluate]
    runs-on: ubuntu-latest
    # always() keeps this job running after evaluate failures, but on its own
    # it would also run for comments that prepare ignored (no "/test", not a
    # pull request, other author). Skip in that case.
    if: always() && needs.prepare.result != 'skipped'
    steps:
      # Checkout trusted scripts for consolidation logic
      - uses: actions/checkout@v4
        with:
          ref: main
          path: trusted-scripts
      # No artifact name given: all artifacts of this run are downloaded.
      - uses: actions/download-artifact@v4
        with:
          path: tests/results/
      - name: Consolidate results
        run: python3 trusted-scripts/ci/consolidate_results.py
      - name: Post to PR
        uses: marocchino/sticky-pull-request-comment@v2
        # hashFiles returns an empty string when no file matches, so this step
        # runs only if comment.md is present in the workspace.
        if: hashFiles('comment.md') != ''
        with:
          number: ${{ github.event.issue.number }}
          path: comment.md