# Benchmark Dashboard #3
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Manually triggered workflow; the inputs below are collected when a run is dispatched.
name: Benchmark Dashboard

on:
  workflow_dispatch:
    inputs:
      # Restricted to the listed choices.
      provider:
        type: choice
        required: true
        description: 'Provider (ollama, copilot, gemini)'
        options:
          - ollama
          - copilot
          - gemini
      # Free-form text supplied by whoever dispatches the run.
      model:
        type: string
        required: true
        description: 'Model name (e.g., qwen-coder-next:cloud)'
      # NOTE(review): no step visible in this workflow reads this input; confirm
      # whether it should be forwarded to the benchmark command.
      skill:
        type: string
        required: false
        description: 'Optional specific skill to test'
jobs:
  # Runs tests/evaluator.py for the dispatched provider/model, then invokes
  # ci/publish_benchmarks.py against the benchmark-history branch.
  run-benchmark:
    runs-on: ubuntu-latest
    env:
      # Dispatch inputs are handed to the scripts as environment variables
      # instead of being template-expanded into the script text, so their
      # content is never parsed as shell syntax.
      BENCH_PROVIDER: ${{ inputs.provider }}
      BENCH_MODEL: ${{ inputs.model }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: main
          path: workspace
      - uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true
      - name: Run benchmark
        working-directory: workspace
        env:
          OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
        run: |
          # Install dependencies
          uv sync --project tests
          # Run benchmark
          uv run --project tests tests/evaluator.py \
            --provider "$BENCH_PROVIDER" \
            --model "$BENCH_MODEL" \
            --judge \
            --verbose \
            --report
          # Rename artifact for clarity
          if [ -d tests/results ]; then
            # A "/" in the model name would be treated as a path separator by
            # mv and make the rename fail, so it is replaced in the directory name.
            SAFE_MODEL="${BENCH_MODEL//\//_}"
            ARTIFACT_NAME="benchmark-${BENCH_PROVIDER}-${SAFE_MODEL}-$(date +%Y%m%d-%H%M%S)"
            mv tests/results "tests/${ARTIFACT_NAME}"
          fi
          # Create docs/benchmarks if it doesn't exist for publish_benchmarks.py
          mkdir -p docs/benchmarks
      - name: Generate dashboard (--no-benchmark)
        working-directory: workspace
        run: |
          uv run --project tests python3 ci/publish_benchmarks.py \
            --provider "$BENCH_PROVIDER" \
            --model "$BENCH_MODEL" \
            --branch benchmark-history \
            --no-benchmark
      # NOTE(review): same command as the previous step minus --no-benchmark;
      # confirm that both invocations are intended.
      - name: Generate dashboard
        working-directory: workspace
        run: |
          uv run --project tests python3 ci/publish_benchmarks.py \
            --provider "$BENCH_PROVIDER" \
            --model "$BENCH_MODEL" \
            --branch benchmark-history

  # Copies published benchmark files from the benchmark-history checkout into
  # the main checkout's docs/ directory, commits them, and pushes.
  deploy-pages:
    needs: run-benchmark
    runs-on: ubuntu-latest
    permissions:
      # The final step runs git push, which needs write access to repository contents.
      contents: write
    steps:
      - name: Checkout workspace
        uses: actions/checkout@v4
        with:
          ref: main
          path: workspace
      - name: Checkout benchmark data
        uses: actions/checkout@v4
        with:
          ref: benchmark-history
          path: benchmark-data
      - name: Copy results to docs
        run: |
          mkdir -p workspace/docs/benchmarks
          # Each copy is best-effort: a missing source file is ignored.
          cp benchmark-data/docs/benchmarks.json workspace/docs/benchmarks.json 2>/dev/null || true
          cp benchmark-data/docs/index.html workspace/docs/index.html 2>/dev/null || true
          # Also copy individual benchmark results if they exist
          cp -r benchmark-data/docs/benchmarks/*.json workspace/docs/benchmarks/ 2>/dev/null || true
      - name: Commit and push updates
        working-directory: workspace
        run: |
          git config user.name "GitHub Actions"
          git config user.email "actions@github.com"
          git add docs/
          # Commit only when something is staged, so a genuine commit failure
          # stops the step instead of being reported as "no changes".
          if git diff --cached --quiet; then
            echo "No changes to commit"
          else
            git commit -m "Update benchmark data"
          fi
          # NOTE(review): this checkout is at ref main, so HEAD here is a
          # main-based commit being pushed to benchmark-history; confirm that
          # is the intended target and history.
          git push origin HEAD:benchmark-history