Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
189 changes: 62 additions & 127 deletions .github/workflows/skill-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,185 +9,120 @@ permissions:
contents: read

jobs:
# Job 1: Generate matrix from config
prepare:
runs-on: ubuntu-latest
if: github.event.issue.pull_request && contains(github.event.comment.body, '/test') && github.actor == 'Ariel-Rodriguez'
outputs:
matrix: ${{ steps.generate-matrix.outputs.matrix }}
filter: ${{ steps.parse.outputs.filter }}
use-parallel: ${{ steps.parse.outputs.use-parallel }}

matrix: ${{ steps.matrix.outputs.result }}

steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v5

- name: Parse test command
id: parse
id: command
run: |
COMMENT="${{ github.event.comment.body }}"
FILTER="all"
USE_PARALLEL="false"

# Check for specific provider filter
if echo "$COMMENT" | grep -q "/test copilot"; then
FILTER="copilot"
elif echo "$COMMENT" | grep -q "/test ollama"; then
FILTER="ollama"
elif echo "$COMMENT" | grep -q "/test gemini"; then
FILTER="gemini"
elif echo "$COMMENT" | grep -q "/test all"; then
FILTER="all"
fi

# Check for parallel flag
if echo "$COMMENT" | grep -q "parallel"; then
USE_PARALLEL="true"
SKILLS=$(python3 ci/parse_test_command.py "${{ github.event.comment.body }}")
echo "skills=$SKILLS" >> $GITHUB_OUTPUT
if [ -n "$SKILLS" ]; then
echo "Override skills: $SKILLS"
else
echo "Auto-detecting changed skills..."
fi

echo "filter=$FILTER" >> $GITHUB_OUTPUT
echo "use-parallel=$USE_PARALLEL" >> $GITHUB_OUTPUT
echo "Testing with filter: $FILTER (parallel: $USE_PARALLEL)"

- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install PyYAML
run: pip install pyyaml

- name: Generate matrix
id: generate-matrix
run: |
MATRIX=$(python3 ci/matrix_generator.py --filter-provider "${{ steps.parse.outputs.filter }}")
echo "matrix=$MATRIX" >> $GITHUB_OUTPUT
echo "Generated matrix:"
echo "$MATRIX" | jq .

# Job 2: Run evaluations (parallel or sequential based on parse)
evaluate:
needs: prepare
runs-on: ubuntu-latest
if: needs.prepare.outputs.use-parallel == 'false'

steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: refs/pull/${{ github.event.issue.number }}/head

- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'

- name: Install Copilot CLI
run: npm install -g @github/copilot

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
enable-cache: true

- name: Set up Python
run: uv python install 3.11

- name: Run orchestration
id: matrix
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
PR_NUMBER: ${{ github.event.issue.number }}
run: |
chmod +x ci/*.sh
./ci/orchestrate_evaluations.sh ${{ github.event.issue.number }} --filter-provider "${{ needs.prepare.outputs.filter }}"
python3 ci/generate_matrix.py "$PR_NUMBER" ${{ steps.command.outputs.skills }} > /tmp/matrix.json

- name: Post results
uses: marocchino/sticky-pull-request-comment@v2
if: always() && hashFiles('comment.md') != ''
with:
number: ${{ github.event.issue.number }}
path: comment.md
echo "result<<EOF" >> $GITHUB_OUTPUT
cat /tmp/matrix.json >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT

python3 -c "import json; m=json.load(open('/tmp/matrix.json')); print(f'Generated {len(m[\"include\"])} jobs')"

# Job 3: Run evaluations in parallel (matrix strategy)
evaluate-parallel:
evaluate:
needs: prepare
runs-on: ubuntu-latest
if: needs.prepare.outputs.use-parallel == 'true'
strategy:
fail-fast: false
max-parallel: 2
matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}

steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: refs/pull/${{ github.event.issue.number }}/head
- uses: actions/checkout@v4

- name: Setup Node.js
uses: actions/setup-node@v4
- uses: actions/setup-node@v4
if: matrix.provider == 'copilot'
with:
node-version: '20'
node-version: "20"

- name: Install Copilot CLI
if: matrix.provider == 'copilot'
run: npm install -g @github/copilot

- name: Install uv
uses: astral-sh/setup-uv@v5
- uses: astral-sh/setup-uv@v5
with:
enable-cache: true

- name: Set up Python
run: uv python install 3.11

- name: Detect changes
id: changes
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- name: Set artifact name
id: artifact
run: |
chmod +x ci/detect_changes.sh
MODIFIED_SKILLS=$(./ci/detect_changes.sh ${{ github.event.issue.number }})
echo "modified_skills=$MODIFIED_SKILLS" >> $GITHUB_OUTPUT
ARTIFACT_NAME="results-${{ matrix.provider }}-${{ matrix.model }}-${{ matrix.skill || 'all' }}"
ARTIFACT_NAME="${ARTIFACT_NAME//:/--}"
echo "name=$ARTIFACT_NAME" >> $GITHUB_OUTPUT

- name: Run evaluation for ${{ matrix.display_name }}
- name: Evaluate ${{ matrix.display_name }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
MODIFIED_SKILLS: ${{ steps.changes.outputs.modified_skills }}
run: |
chmod +x ci/run_evaluation.sh
./ci/run_evaluation.sh "${{ matrix.provider }}" "${{ matrix.model }}" 50 "${{ matrix.extra_args }}"
CMD="uv run --project tests --frozen tests/evaluator.py"
CMD="$CMD --provider ${{ matrix.provider }}"
CMD="$CMD --model ${{ matrix.model }}"
CMD="$CMD --judge --verbose --report --threshold 50"

if [ -n "${{ matrix.extra_args }}" ]; then
CMD="$CMD ${{ matrix.extra_args }}"
fi

if [ -n "${{ matrix.skill }}" ]; then
CMD="$CMD --skill ${{ matrix.skill }}"
else
CMD="$CMD --all"
fi

echo "Running: $CMD"
eval "$CMD"

- name: Upload results
uses: actions/upload-artifact@v4
if: always()
uses: actions/upload-artifact@v4
with:
name: results-${{ matrix.provider }}-${{ matrix.model }}
path: tests/results/${{ matrix.provider }}/${{ matrix.model }}/
name: ${{ steps.artifact.outputs.name }}
path: tests/results/
retention-days: 1

# Job 4: Consolidate parallel results
consolidate:
needs: [prepare, evaluate-parallel]
needs: evaluate
runs-on: ubuntu-latest
if: needs.prepare.outputs.use-parallel == 'true' && always()
if: always()

steps:
- name: Checkout code
uses: actions/checkout@v4
- uses: actions/checkout@v4

- name: Download all artifacts
uses: actions/download-artifact@v4
- uses: actions/download-artifact@v4
with:
path: tests/results/

- name: Consolidate results
run: |
chmod +x ci/consolidate_results.sh
./ci/consolidate_results.sh || true
run: python3 ci/consolidate_results.py

- name: Post consolidated results
- name: Post to PR
uses: marocchino/sticky-pull-request-comment@v2
if: hashFiles('comment.md') != ''
with:
Expand Down
8 changes: 0 additions & 8 deletions ci/collect-artifacts.sh

This file was deleted.

4 changes: 2 additions & 2 deletions ci/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@
ollama:
enabled: true
local: false
context: 32000
context: 28000
models:
- gpt-oss:20b-cloud
- rnj-1:8b-cloud
- devstral-small-2:24b-cloud
# - gpt-oss:20b-cloud

copilot:
enabled: false
Expand Down
124 changes: 124 additions & 0 deletions ci/consolidate_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/env python3
"""
Consolidate results from parallel evaluations.

Aggregates outputs from multiple provider/model runs and generates summary.
"""

import json
import sys
from pathlib import Path


def main():
    """Aggregate parallel evaluation results into a Markdown PR comment.

    Scans ``tests/results/*/summary.json`` (artifacts download one
    directory per artifact name), builds a comparison table of baseline
    vs. with-skill ratings, and writes it to ``comment.md`` for the
    sticky-comment action to post. Always exits 0 so the consolidate
    job never fails the workflow, even when results are missing.
    """
    results_base = Path("tests/results")

    print("==> Consolidating results from all evaluations")

    # Each evaluation job uploads a summary.json inside its own
    # artifact directory; the download step flattens them under
    # tests/results/<artifact-name>/.
    summary_files = sorted(results_base.glob("*/summary.json"))

    if not summary_files:
        print(f"No results found in {results_base}")
        print(f"Directory contents: {list(results_base.glob('*'))}")
        # Write an empty comment so the posting step still has a file.
        Path("comment.md").write_text("# Evaluation Results\n\nNo results found.\n")
        sys.exit(0)

    failed = 0
    all_results = []

    print("\nResults:")
    print("========")

    for summary_file in summary_files:
        artifact_name = summary_file.parent.name
        try:
            summary = json.loads(summary_file.read_text())
        except (OSError, json.JSONDecodeError):
            # Tolerate one corrupt/unreadable artifact without sinking
            # the rest of the consolidation.
            print(f"❌ {artifact_name} - Could not parse JSON")
            failed += 1
            continue

        all_results.append({
            "artifact": artifact_name,
            "summary": summary,
        })
        print(f"✅ {artifact_name}")

    print("\nSummary:")
    print("========")
    print(f"Processed: {len(all_results)}")
    print(f"Failed: {failed}")

    # Generate comment
    comment = "# 📊 Evaluation Results\n\n"

    if all_results:
        comment += f"Processed {len(all_results)} evaluation(s).\n\n"

        # Build table with results
        comment += "| Test Name | Model | Baseline | With Skill | Cases Pass | Winner |\n"
        comment += "|-----------|-------|----------|------------|------------|--------|\n"

        # Ordered quality scale used to decide whether the skill run
        # matched or beat the baseline rating.
        rating_hierarchy = {'vague': 0, 'regular': 1, 'good': 2, 'outstanding': 3}

        for result in all_results:
            summary = result['summary']
            artifact = result['artifact']

            # Debug: Log top-level keys
            print(f"\n📋 Processing: {artifact}")
            print(f" Top-level keys: {list(summary.keys())}")

            # The evaluator nests the per-run data under results[0];
            # fall back to an empty dict when the list is absent/empty.
            eval_result = (summary.get('results') or [{}])[0]
            print(f" Nested result keys: {list(eval_result.keys())}")

            model = eval_result.get('model', 'N/A')
            baseline_rating = eval_result.get('baseline_rating', 'N/A')
            skill_rating = eval_result.get('skill_rating', 'N/A')
            skill_pass = eval_result.get('skill_pass_count', 'N/A')
            overall_better = eval_result.get('judgment', {}).get('overall_better', 'N/A')

            print(f" Extracted: model={model}, baseline={baseline_rating}, skill={skill_rating}, winner={overall_better}")

            # Judge verdict: 'A' = baseline won, 'B' = skill run won.
            if overall_better == 'A':
                winner = "Baseline"
            elif overall_better == 'B':
                winner = "With Skill"
            elif overall_better == 'TIE':
                winner = "Tie"
            else:
                winner = "N/A"

            # Pass when the skill run rated at least as high as the
            # baseline; unknown ratings score -1 and compare normally.
            baseline_score = rating_hierarchy.get(baseline_rating, -1)
            skill_score = rating_hierarchy.get(skill_rating, -1)
            pass_emoji = "✅" if skill_score >= baseline_score else "❌"

            # NOTE(review): empty link target renders a dead link —
            # presumably a placeholder for the job/run URL; confirm.
            test_link = f"[{artifact}]()"
            comment += f"| {test_link} | {model} | {baseline_rating} | {skill_rating} | {pass_emoji} {skill_pass} | {winner} |\n"

        comment += "\n"
    else:
        comment += "No evaluation results found.\n"

    Path("comment.md").write_text(comment)
    print("\n✓ Comment saved to comment.md")

    sys.exit(0)


if __name__ == "__main__":
    # Removed unused `import os` — nothing in this module references os.
    main()
Loading
Loading