chore: Benchmarks Page - stage 1/2 #47
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Skill Validation
#
# Runs when a comment is created on an issue or pull request. The `prepare`
# job only proceeds for comments on pull requests that contain "/test" and
# were written by the maintainer account. It builds a job matrix with the
# scripts from `main`, `evaluate` runs the evaluator once per matrix entry,
# and `consolidate` merges the uploaded results and posts them to the PR.
#
# Scripts that are executed always come from `main` (checked out into
# `trusted-scripts/`). Values taken from the comment or from the generated
# matrix are handed to shell steps through `env:` so they are treated as data
# and are never spliced into the script text.
name: Skill Validation

on:
  issue_comment:
    types: [created]

permissions:
  pull-requests: write
  contents: read

jobs:
  prepare:
    runs-on: ubuntu-latest
    # Only react to "/test" comments on pull requests (plain issues have no
    # `pull_request` field) written by the maintainer account.
    if: >-
      github.event.issue.pull_request
      && contains(github.event.comment.body, '/test')
      && github.actor == 'Ariel-Rodriguez'
    outputs:
      matrix: ${{ steps.matrix.outputs.result }}
    steps:
      # Checkout trusted scripts from main
      - uses: actions/checkout@v4
        with:
          ref: main
          path: trusted-scripts

      # Checkout PR code (for context, though prepare mainly uses scripts).
      # `issue_comment` runs resolve to the default branch, so the pull
      # request head has to be requested explicitly.
      - uses: actions/checkout@v4
        with:
          ref: refs/pull/${{ github.event.issue.number }}/head
          path: workspace

      - uses: astral-sh/setup-uv@v5

      - name: Parse test command
        id: command
        env:
          # Passed through the environment: the comment text is user input
          # and must not be expanded into the script source.
          COMMENT_BODY: ${{ github.event.comment.body }}
        run: |
          SKILLS=$(python3 trusted-scripts/ci/parse_test_command.py "$COMMENT_BODY")
          echo "skills=$SKILLS" >> "$GITHUB_OUTPUT"
          if [ -n "$SKILLS" ]; then
            echo "Override skills: $SKILLS"
          else
            echo "Auto-detecting changed skills..."
          fi

      - name: Generate matrix
        id: matrix
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PR_NUMBER: ${{ github.event.issue.number }}
          SKILLS_OVERRIDE: ${{ steps.command.outputs.skills }}
        run: |
          # Run from workspace root but use trusted script
          cd workspace
          # Split the override list on whitespace into separate arguments
          # without letting the shell evaluate or glob its contents.
          read -r -a SKILL_ARGS <<< "$SKILLS_OVERRIDE"
          python3 ../trusted-scripts/ci/generate_matrix.py "$PR_NUMBER" "${SKILL_ARGS[@]}" > /tmp/matrix.json
          # Multi-line output: emit the JSON between heredoc-style delimiters.
          echo "result<<EOF" >> "$GITHUB_OUTPUT"
          cat /tmp/matrix.json >> "$GITHUB_OUTPUT"
          echo "EOF" >> "$GITHUB_OUTPUT"
          python3 -c "import json; m=json.load(open('/tmp/matrix.json')); print(f'Generated {len(m[\"include\"])} jobs')"

  evaluate:
    needs: prepare
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
    steps:
      # Checkout actual PR code to test. This root checkout must come first:
      # checking out into the workspace root clears whatever is already
      # there, which would remove a previously created trusted-scripts/.
      - uses: actions/checkout@v4
        with:
          ref: refs/pull/${{ github.event.issue.number }}/head

      # Checkout trusted scripts again (new job), nested inside the workspace
      - uses: actions/checkout@v4
        with:
          ref: main
          path: trusted-scripts

      - uses: actions/setup-node@v4
        if: matrix.provider == 'copilot'
        with:
          node-version: "20"

      - name: Install Copilot CLI
        if: matrix.provider == 'copilot'
        run: npm install -g @github/copilot

      - uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true

      - name: Set artifact name
        id: artifact
        env:
          MATRIX_PROVIDER: ${{ matrix.provider }}
          MATRIX_MODEL: ${{ matrix.model }}
          MATRIX_SKILL: ${{ matrix.skill || 'all' }}
        run: |
          ARTIFACT_NAME="results-${MATRIX_PROVIDER}-${MATRIX_MODEL}-${MATRIX_SKILL}"
          # Replace every ':' with '--' in the artifact name.
          ARTIFACT_NAME="${ARTIFACT_NAME//:/--}"
          echo "name=$ARTIFACT_NAME" >> "$GITHUB_OUTPUT"

      - name: Evaluate ${{ matrix.display_name }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
          TRULENS_OTEL_ENDPOINT: ${{ secrets.TRULENS_OTEL_ENDPOINT }}
          MATRIX_PROVIDER: ${{ matrix.provider }}
          MATRIX_MODEL: ${{ matrix.model }}
          MATRIX_SKILL: ${{ matrix.skill }}
          MATRIX_EXTRA_ARGS: ${{ matrix.extra_args }}
        run: |
          # Construct arguments array for safety
          ARGS=(--provider "$MATRIX_PROVIDER" --model "$MATRIX_MODEL" --judge --verbose --report --threshold 50)
          if [ -n "$MATRIX_EXTRA_ARGS" ]; then
            # extra_args is assumed to hold simple whitespace-separated flags;
            # split it into words without shell evaluation or globbing.
            read -r -a EXTRA_ARGS <<< "$MATRIX_EXTRA_ARGS"
            ARGS+=("${EXTRA_ARGS[@]}")
          fi
          if [ -n "$MATRIX_SKILL" ]; then
            ARGS+=(--skill "$MATRIX_SKILL")
          else
            ARGS+=(--all)
          fi
          echo "Running evaluation..."
          # Use trusted script
          uv run --project trusted-scripts/tests --frozen trusted-scripts/tests/evaluator.py "${ARGS[@]}"

      - name: Upload results
        # Upload whatever was produced even when the evaluation step failed.
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact.outputs.name }}
          path: tests/results/
          retention-days: 1

  consolidate:
    needs: [prepare, evaluate]
    runs-on: ubuntu-latest
    # Report even when some evaluations failed, but only when `prepare`
    # actually ran; otherwise this job would start for every comment created
    # on any issue or pull request.
    if: always() && needs.prepare.result == 'success'
    steps:
      # Checkout trusted scripts for consolidation logic
      - uses: actions/checkout@v4
        with:
          ref: main
          path: trusted-scripts

      - uses: actions/download-artifact@v4
        with:
          path: tests/results/

      - name: Consolidate results
        run: python3 trusted-scripts/ci/consolidate_results.py

      - name: Post to PR
        uses: marocchino/sticky-pull-request-comment@v2
        # Skip posting when no comment.md exists in the workspace.
        if: hashFiles('comment.md') != ''
        with:
          number: ${{ github.event.issue.number }}
          path: comment.md