diff --git a/.github/workflows/benchmark-dashboard.yml b/.github/workflows/benchmark-dashboard.yml
index 20eb7f5..20d2e44 100644
--- a/.github/workflows/benchmark-dashboard.yml
+++ b/.github/workflows/benchmark-dashboard.yml
@@ -53,6 +53,23 @@ jobs:
             ARTIFACT_NAME="benchmark-${{ inputs.provider }}-${{ inputs.model }}-$(date +%Y%m%d-%H%M%S)"
             mv tests/results "tests/${ARTIFACT_NAME}"
           fi
+          # Create docs/benchmarks if it doesn't exist for publish_benchmarks.py
+          mkdir -p docs/benchmarks
+
+      # NOTE(review): the following step is also named "Generate dashboard"; confirm both are needed.
+      - name: Generate dashboard
+        working-directory: workspace
+        # Pass workflow inputs through env rather than interpolating them into
+        # the script text, so their values cannot be parsed as shell syntax.
+        env:
+          BENCH_PROVIDER: ${{ inputs.provider }}
+          BENCH_MODEL: ${{ inputs.model }}
+        run: |
+          uv run --project tests python3 ci/publish_benchmarks.py \
+            --provider "$BENCH_PROVIDER" \
+            --model "$BENCH_MODEL" \
+            --branch benchmark-history \
+            --no-benchmark
 
       - name: Generate dashboard
         working-directory: workspace
@@ -79,11 +96,19 @@ jobs:
           ref: benchmark-history
           path: benchmark-data
 
-      - name: Verify deployment
+      - name: Copy results to docs
+        run: |
+          mkdir -p workspace/docs/benchmarks
+          cp benchmark-data/docs/benchmarks.json workspace/docs/benchmarks.json 2>/dev/null || true
+          cp benchmark-data/docs/index.html workspace/docs/index.html 2>/dev/null || true
+          # Also copy individual benchmark results if they exist
+          cp -r benchmark-data/docs/benchmarks/*.json workspace/docs/benchmarks/ 2>/dev/null || true
+
+      - name: Commit and push updates
+        working-directory: workspace
         run: |
-          echo "Benchmark data directory contents:"
-          ls -la benchmark-data/docs/
-          echo ""
-          echo "GitHub Pages will be served from benchmark-history branch"
-          echo "Dashboard will be available at:"
-          echo "https://${{ github.repository }}/index.html"
+          git config user.name "GitHub Actions"
+          git config user.email "actions@github.com"
+          git add docs/
+          git commit -m "Update benchmark data" || echo "No changes to commit"
+          git push origin HEAD:benchmark-history
diff --git a/.github/workflows/skill-validation.yml b/.github/workflows/skill-validation.yml
index 402de19..2151882 100644
--- a/.github/workflows/skill-validation.yml
+++ b/.github/workflows/skill-validation.yml
@@ -71,8 +71,10 @@ jobs:
           ref: main
           path: trusted-scripts
 
-      # Checkout actual PR code to test
+      # Checkout actual PR code to test (to different path to avoid overwriting trusted-scripts)
       - uses: actions/checkout@v4
+        with:
+          path: pr-code
 
       - uses: actions/setup-node@v4
         if: matrix.provider == 'copilot'
@@ -100,10 +102,11 @@ jobs:
           COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
           TRULENS_OTEL_ENDPOINT: ${{ secrets.TRULENS_OTEL_ENDPOINT }}
+        working-directory: pr-code
         run: |
           # Construct arguments array for safety
           ARGS=(--provider "${{ matrix.provider }}" --model "${{ matrix.model }}" --judge --verbose --report --threshold 50)
-          
+
           if [ -n "${{ matrix.extra_args }}" ]; then
             # Split extra_args safely if needed, but for now assuming simple flags
             ARGS+=(${{ matrix.extra_args }})
@@ -116,15 +119,16 @@ jobs:
           fi
 
           echo "Running evaluation..."
-          # Use trusted script with correct project path
-          uv run --project . --frozen trusted-scripts/tests/evaluator.py "${ARGS[@]}"
+          # Run evaluator from trusted-scripts/tests
+          # The working-directory is pr-code, so evaluator.py finds PR code at current directory
+          uv run --project . --frozen ../trusted-scripts/tests/evaluator.py "${ARGS[@]}"
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
           name: ${{ steps.artifact.outputs.name }}
-          path: tests/results/
+          path: pr-code/tests/results/
           retention-days: 1
 
   consolidate:
@@ -141,14 +145,14 @@ jobs:
 
       - uses: actions/download-artifact@v4
         with:
-          path: tests/results/
+          path: pr-code/tests/results/
 
       - name: Consolidate results
-        run: python3 trusted-scripts/ci/consolidate_results.py
+        run: python3 trusted-scripts/ci/consolidate_results.py --results-dir pr-code/tests/results --output-file pr-code/comment.md
 
       - name: Post to PR
         uses: marocchino/sticky-pull-request-comment@v2
-        if: hashFiles('comment.md') != ''
+        if: hashFiles('pr-code/comment.md') != ''
         with:
           number: ${{ github.event.issue.number }}
-          path: comment.md
+          path: pr-code/comment.md
diff --git a/skills/ps-error-handling-design/SKILL.md b/skills/ps-error-handling-design/SKILL.md
index 9478417..504a85e 100644
--- a/skills/ps-error-handling-design/SKILL.md
+++ b/skills/ps-error-handling-design/SKILL.md
@@ -4,7 +4,7 @@ description: Design systems with explicit error handling. Avoid throwing excepti
 severity: WARN
 ---
 
-## Principle
+# Principle
 
 Treat error handling as a first-class design concern, not an afterthought:
 