chore: Benchmarks Page - stage 1/2 #47

Workflow file for this run

.github/workflows/skill-validation.yml at db641ba

	# Skill Validation workflow: triggered by PR comments; the `prepare` job
	# decides whether a given comment actually starts an evaluation run.
	name: Skill Validation

	# Fires for every newly created issue or pull-request comment.
	on:
	  issue_comment:
	    types: [created]

	# Token scopes for all jobs: write access to pull requests (used to post the
	# results comment) and read-only access to repository contents (checkout).
	permissions:
	  pull-requests: write
	  contents: read

	jobs:
	# Job: gate the run and build the evaluation matrix consumed by `evaluate`.
	prepare:
	  runs-on: ubuntu-latest
	  # Run only for comments on pull requests that contain '/test' and were
	  # triggered by the allowed actor.
	  if: github.event.issue.pull_request && contains(github.event.comment.body, '/test') && github.actor == 'Ariel-Rodriguez'
	  outputs:
	    matrix: ${{ steps.matrix.outputs.result }}

	  steps:
	    # Checkout trusted scripts from main
	    - uses: actions/checkout@v4
	      with:
	        ref: main
	        path: trusted-scripts

	    # Checkout PR code (for context, though prepare mainly uses scripts).
	    # An issue_comment run resolves to the default branch, so the PR head
	    # has to be requested explicitly.
	    - uses: actions/checkout@v4
	      with:
	        ref: refs/pull/${{ github.event.issue.number }}/head
	        path: workspace

	    - uses: astral-sh/setup-uv@v5

	    - name: Parse test command
	      id: command
	      env:
	        # Passed through the environment so the comment text is never
	        # interpreted as part of the shell script.
	        COMMENT_BODY: ${{ github.event.comment.body }}
	      run: |
	        SKILLS=$(python3 trusted-scripts/ci/parse_test_command.py "$COMMENT_BODY")
	        echo "skills=$SKILLS" >> "$GITHUB_OUTPUT"
	        if [ -n "$SKILLS" ]; then
	          echo "Override skills: $SKILLS"
	        else
	          echo "Auto-detecting changed skills..."
	        fi

	    - name: Generate matrix
	      id: matrix
	      env:
	        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        PR_NUMBER: ${{ github.event.issue.number }}
	        # Derived from the comment text; kept out of the script body.
	        REQUESTED_SKILLS: ${{ steps.command.outputs.skills }}
	      run: |
	        # Run from workspace root but use trusted script
	        cd workspace

	        # Split the skill list on whitespace into separate arguments without
	        # evaluating it as shell code (an empty list yields no arguments).
	        read -r -a SKILL_ARGS <<< "$REQUESTED_SKILLS"
	        python3 ../trusted-scripts/ci/generate_matrix.py "$PR_NUMBER" "${SKILL_ARGS[@]}" > /tmp/matrix.json

	        # Multi-line output: everything between the EOF markers becomes `result`.
	        echo "result<<EOF" >> "$GITHUB_OUTPUT"
	        cat /tmp/matrix.json >> "$GITHUB_OUTPUT"
	        echo "EOF" >> "$GITHUB_OUTPUT"

	        python3 -c "import json; m=json.load(open('/tmp/matrix.json')); print(f'Generated {len(m[\"include\"])} jobs')"

	# Job: run the evaluator once per matrix entry produced by `prepare`.
	evaluate:
	  needs: prepare
	  runs-on: ubuntu-latest
	  strategy:
	    # Let the remaining matrix entries finish when one entry fails.
	    fail-fast: false
	    max-parallel: 2
	    matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}

	  steps:
	    # Checkout actual PR code to test. This must come first: a checkout into
	    # the workspace root clears that directory, which would delete a
	    # previously checked-out trusted-scripts/. An issue_comment run resolves
	    # to the default branch, so the PR head is requested explicitly.
	    - uses: actions/checkout@v4
	      with:
	        ref: refs/pull/${{ github.event.issue.number }}/head

	    # Checkout trusted scripts again (new job), nested inside the workspace
	    - uses: actions/checkout@v4
	      with:
	        ref: main
	        path: trusted-scripts

	    - uses: actions/setup-node@v4
	      if: matrix.provider == 'copilot'
	      with:
	        node-version: "20"

	    - name: Install Copilot CLI
	      if: matrix.provider == 'copilot'
	      run: npm install -g @github/copilot

	    - uses: astral-sh/setup-uv@v5
	      with:
	        enable-cache: true

	    - name: Set artifact name
	      id: artifact
	      env:
	        # Matrix values go through the environment instead of being pasted
	        # into the script text.
	        MATRIX_PROVIDER: ${{ matrix.provider }}
	        MATRIX_MODEL: ${{ matrix.model }}
	        ARTIFACT_SKILL: ${{ matrix.skill || 'all' }}
	      run: |
	        ARTIFACT_NAME="results-${MATRIX_PROVIDER}-${MATRIX_MODEL}-${ARTIFACT_SKILL}"
	        # Replace every ':' in the name with '--'.
	        ARTIFACT_NAME="${ARTIFACT_NAME//:/--}"
	        echo "name=$ARTIFACT_NAME" >> "$GITHUB_OUTPUT"

	    - name: Evaluate ${{ matrix.display_name }}
	      env:
	        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	        OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
	        TRULENS_OTEL_ENDPOINT: ${{ secrets.TRULENS_OTEL_ENDPOINT }}
	        MATRIX_PROVIDER: ${{ matrix.provider }}
	        MATRIX_MODEL: ${{ matrix.model }}
	        MATRIX_SKILL: ${{ matrix.skill }}
	        MATRIX_EXTRA_ARGS: ${{ matrix.extra_args }}
	      run: |
	        # Construct arguments array for safety
	        ARGS=(--provider "$MATRIX_PROVIDER" --model "$MATRIX_MODEL" --judge --verbose --report --threshold 50)

	        if [ -n "$MATRIX_EXTRA_ARGS" ]; then
	          # Whitespace-split only: assumes simple flags, so quoting inside
	          # extra_args is not interpreted.
	          read -r -a EXTRA_ARGS <<< "$MATRIX_EXTRA_ARGS"
	          ARGS+=("${EXTRA_ARGS[@]}")
	        fi

	        if [ -n "$MATRIX_SKILL" ]; then
	          ARGS+=(--skill "$MATRIX_SKILL")
	        else
	          ARGS+=(--all)
	        fi

	        echo "Running evaluation..."
	        # Use trusted script
	        uv run --project trusted-scripts/tests --frozen trusted-scripts/tests/evaluator.py "${ARGS[@]}"

	    # Upload even when the evaluation step failed.
	    - name: Upload results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: ${{ steps.artifact.outputs.name }}
	        path: tests/results/
	        retention-days: 1

	# Job: merge the per-entry result artifacts and post a summary on the PR.
	consolidate:
	  needs: [prepare, evaluate]
	  runs-on: ubuntu-latest
	  # always() keeps the summary running when some evaluations failed; the
	  # prepare check keeps this job from running for comments that never
	  # started a run (prepare skipped).
	  if: always() && needs.prepare.result == 'success'

	  steps:
	    # Checkout trusted scripts for consolidation logic
	    - uses: actions/checkout@v4
	      with:
	        ref: main
	        path: trusted-scripts

	    # No artifact name given: fetch the artifacts of this run into tests/results/.
	    - uses: actions/download-artifact@v4
	      with:
	        path: tests/results/

	    - name: Consolidate results
	      run: python3 trusted-scripts/ci/consolidate_results.py

	    # Post only when a comment.md file exists in the workspace.
	    - name: Post to PR
	      uses: marocchino/sticky-pull-request-comment@v2
	      if: hashFiles('comment.md') != ''
	      with:
	        number: ${{ github.event.issue.number }}
	        path: comment.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

chore: Benchmarks Page - stage 1/2 #47

Workflow file

chore: Benchmarks Page - stage 1/2 #47

Uh oh!

Workflow file for this run