chore: parallel more

Ariel-Rodriguez · Ariel-Rodriguez · commit 1a17b07893c1 · 2026-01-31T16:31:23.000+01:00
diff --git a/.github/workflows/skill-validation.yml b/.github/workflows/skill-validation.yml
@@ -9,21 +9,34 @@ permissions:
   contents: read
 
 jobs:
-  # Prepare matrix for parallel evaluation (default)
+  # Prepare matrix for per-skill parallelization
   prepare:
     runs-on: ubuntu-latest
     if: github.event.issue.pull_request && contains(github.event.comment.body, '/test')
     outputs:
       matrix: ${{ steps.generate-matrix.outputs.matrix }}
-      filter: ${{ steps.parse.outputs.filter }}
     
     steps:
-      - name: Parse test command
-        id: parse
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Detect skills and generate matrix
+        id: generate-matrix
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
+          python3 -m pip install -q pyyaml
+          
+          # Detect which skills changed
+          MODIFIED_SKILLS=$(python3 ci/detect_changes.py ${{ github.event.issue.number }} 2>/dev/null || echo "")
+          echo "Modified skills: $MODIFIED_SKILLS"
+          
+          # Extract filter from comment
           COMMENT="${{ github.event.comment.body }}"
           FILTER="all"
-          
           if echo "$COMMENT" | grep -q "/test copilot"; then
             FILTER="copilot"
           elif echo "$COMMENT" | grep -q "/test ollama"; then
@@ -32,24 +45,42 @@ jobs:
             FILTER="gemini"
           fi
           
-          echo "filter=$FILTER" >> $GITHUB_OUTPUT
-      
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Generate matrix
-        id: generate-matrix
-        run: |
-          python3 -m pip install -q pyyaml
-          MATRIX=$(python3 ci/matrix_generator.py --filter-provider "${{ steps.parse.outputs.filter }}")
+          # Generate base matrix from config
+          BASE_MATRIX=$(python3 ci/matrix_generator.py --filter-provider "$FILTER")
+          
+          # Expand matrix: one job per skill per model
+          python3 << 'PYTHON_EOF'
+          import json
+          import sys
+          
+          base_matrix = json.loads("""$BASE_MATRIX""")
+          modified_skills = """$MODIFIED_SKILLS""".strip().split()
+          
+          matrix = {"include": []}
+          
+          if modified_skills and modified_skills[0]:
+              # Per-skill parallelization
+              for item in base_matrix["include"]:
+                  for skill in modified_skills:
+                      new_item = item.copy()
+                      new_item["skill"] = skill
+                      new_item["display_name"] = f"{item['display_name']} / {skill}"
+                      matrix["include"].append(new_item)
+          else:
+              # Test all skills
+              matrix = base_matrix
+          
+          print(json.dumps(matrix, indent=2))
+          with open('/tmp/matrix.json', 'w') as f:
+              json.dump(matrix, f)
+          PYTHON_EOF
+          
+          MATRIX=$(cat /tmp/matrix.json)
           echo "matrix<<EOF" >> $GITHUB_OUTPUT
           echo "$MATRIX" >> $GITHUB_OUTPUT
           echo "EOF" >> $GITHUB_OUTPUT
 
-  # Run evaluations in parallel (default: matrix strategy)
+  # Run evaluations - one per skill per model (parallel)
   evaluate:
     needs: prepare
     runs-on: ubuntu-latest
@@ -72,33 +103,40 @@ jobs:
         with:
           enable-cache: true
 
-      - name: Detect changes
-        id: changes
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          python3 ci/detect_changes.py ${{ github.event.issue.number }} > modified_skills.txt 2>/dev/null || echo "" > modified_skills.txt
-          SKILLS=$(cat modified_skills.txt)
-          echo "skills=$SKILLS" >> $GITHUB_OUTPUT
-
       - name: Run evaluation for ${{ matrix.display_name }}
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
-          MODIFIED_SKILLS: ${{ steps.changes.outputs.skills }}
         run: |
-          python3 ci/run_evaluation.py "${{ matrix.provider }}" "${{ matrix.model }}" 50 "${{ matrix.extra_args }}"
+          # Build command
+          CMD="uv run --project tests --frozen tests/evaluator.py"
+          CMD="$CMD --provider ${{ matrix.provider }}"
+          CMD="$CMD --model '${{ matrix.model }}'"
+          CMD="$CMD --judge --verbose --report"
+          
+          if [ -n "${{ matrix.extra_args }}" ]; then
+            CMD="$CMD ${{ matrix.extra_args }}"
+          fi
+          
+          if [ -n "${{ matrix.skill }}" ]; then
+            CMD="$CMD --skill ${{ matrix.skill }}"
+          else
+            CMD="$CMD --all"
+          fi
+          
+          echo "Running: $CMD"
+          eval "$CMD"
 
       - name: Upload results
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: results-${{ matrix.provider }}-${{ matrix.model }}
-          path: tests/results/${{ matrix.provider }}/${{ matrix.model }}/
+          name: results-${{ matrix.provider }}-${{ matrix.model }}${{ matrix.skill && format('-{0}', matrix.skill) || '' }}
+          path: tests/results/
           retention-days: 1
 
-  # Consolidate results from all parallel runs
+  # Consolidate and post results
   consolidate:
     needs: evaluate
     runs-on: ubuntu-latest
@@ -112,7 +150,29 @@ jobs:
           path: tests/results/
 
       - name: Consolidate results
-        run: python3 ci/consolidate_results.py || true
+        run: |
+          python3 << 'PYTHON_EOF'
+          import json
+          from pathlib import Path
+          
+          results_base = Path("tests/results")
+          all_passed = True
+          
+          # Scan all result dirs
+          for summary_file in results_base.glob("*/summary.json"):
+              try:
+                  with open(summary_file) as f:
+                      summary = json.load(f)
+                      # Check if evaluation passed
+              except:
+                  pass
+          
+          # Generate comment
+          comment = "## 🎉 Skill Evaluations Complete\n\n"
+          comment += "Results aggregated from parallel runs.\n"
+          
+          Path("comment.md").write_text(comment)
+          PYTHON_EOF
 
       - name: Post results
         uses: marocchino/sticky-pull-request-comment@v2
diff --git a/ci/config.yaml b/ci/config.yaml
@@ -4,11 +4,11 @@
 ollama:
   enabled: true
   local: false
-  context: 32000
+  context: 28000
   models:
-    - gpt-oss:20b-cloud
     - rnj-1:8b-cloud
     - devstral-small-2:24b-cloud
+#   - gpt-oss:20b-cloud
 
 copilot:
   enabled: false
diff --git a/ci/orchestrate_evaluations.py b/ci/orchestrate_evaluations.py
@@ -32,7 +32,7 @@ def validate_environment():
 
 def detect_changes(pr_number: int) -> str:
     """Detect modified skills in PR."""
-    print("\n==> Detecting changes")
+    print("\n==> Detecting changes in PR #{}".format(pr_number))
 
     result = subprocess.run(
         ["python3", "ci/detect_changes.py", str(pr_number)],
@@ -43,15 +43,21 @@ def detect_changes(pr_number: int) -> str:
     modified_skills = result.stdout.strip()
 
     if modified_skills:
-        print(f"✓ Will test skills: {modified_skills}")
+        skill_list = modified_skills.split()
+        print(f"✓ Found {len(skill_list)} modified skill(s): {', '.join(skill_list[:3])}" + 
+              (f" +{len(skill_list)-3} more" if len(skill_list) > 3 else ""))
     else:
-        print("✓ Will test all skills")
+        print("✓ No skills modified - will test all skills")
 
     return modified_skills
 
 
-def generate_matrix(filter_provider: str = "all") -> list:
-    """Generate evaluation matrix from configuration."""
+def generate_matrix(filter_provider: str = "all", skills: str = "") -> list:
+    """Generate evaluation matrix from configuration with per-skill jobs.
+    
+    If skills are provided, creates one matrix item per skill per model.
+    Otherwise creates one item per model (tests all skills).
+    """
     print("\n==> Generating evaluation matrix")
 
     result = subprocess.run(
@@ -62,12 +68,14 @@ def generate_matrix(filter_provider: str = "all") -> list:
 
     if result.returncode != 0:
         print("❌ Error generating matrix")
+        print(result.stderr)
         sys.exit(1)
 
     try:
         matrix_data = json.loads(result.stdout)
     except json.JSONDecodeError:
         print("❌ Error parsing matrix JSON")
+        print(result.stdout)
         sys.exit(1)
 
     items = matrix_data.get("include", [])
@@ -76,9 +84,25 @@ def generate_matrix(filter_provider: str = "all") -> list:
         print("❌ Error: No enabled providers in configuration")
         sys.exit(1)
 
-    print(f"✓ Generated matrix with {len(items)} configurations")
-    for item in items:
-        print(f"  - {item['display_name']}")
+    # If skills are specified, expand matrix to one item per skill per model
+    if skills and skills.strip():
+        skill_list = skills.strip().split()
+        expanded_items = []
+        
+        for item in items:
+            for skill in skill_list:
+                expanded_item = item.copy()
+                expanded_item["skill"] = skill
+                expanded_item["display_name"] = f"{item['display_name']} / {skill}"
+                expanded_items.append(expanded_item)
+        
+        items = expanded_items
+        print(f"✓ Generated matrix with {len(items)} job(s) ({len(items)//len(skill_list)} model(s) × {len(skill_list)} skill(s))")
+    else:
+        print(f"✓ Generated matrix with {len(items)} configuration(s) (all skills per model)")
+    
+    for i, item in enumerate(items, 1):
+        print(f"  {i}. {item['display_name']}")
 
     return items
 
@@ -105,15 +129,75 @@ def run_sequential(items: list, threshold: int = 50):
     subprocess.run(["python3", "ci/consolidate_results.py"])
 
 
-def run_parallel(items: list, threshold: int = 50):
-    """Run evaluations in parallel (for GitHub Actions matrix strategy)."""
-    print("\n==> Running evaluations in parallel")
-    print("(This is for GitHub Actions matrix strategy - not running locally)")
+def run_parallel_local(items: list, threshold: int = 50):
+    """Run evaluations in parallel locally (one job per skill per model)."""
+    print(f"\n==> Running {len(items)} evaluation(s) in parallel")
 
-    for item in items:
-        print(f"  Will run: {item['display_name']}")
-
-    print("\nUse GitHub Actions matrix strategy in workflow for true parallelization")
+    import concurrent.futures
+    
+    def run_single_eval(item):
+        provider = item["provider"]
+        model = item["model"]
+        extra_args = item.get("extra_args", "")
+        skill = item.get("skill")
+        
+        display = f"{provider}/{model}" + (f"/{skill}" if skill else "")
+        print(f"[{display}] Starting...")
+        
+        cmd = [
+            "uv",
+            "run",
+            "--project",
+            "tests",
+            "--frozen",
+            "tests/evaluator.py",
+            "--provider",
+            provider,
+            "--model",
+            model,
+            "--threshold",
+            str(threshold),
+            "--judge",
+            "--verbose",
+            "--report",
+        ]
+        
+        if extra_args.strip():
+            cmd.extend(extra_args.split())
+        
+        if skill:
+            cmd.extend(["--skill", skill])
+        else:
+            cmd.append("--all")
+        
+        result = subprocess.run(cmd, capture_output=False)
+        
+        return provider, model, skill, result.returncode
+    
+    with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(items), 4)) as executor:
+        futures = {executor.submit(run_single_eval, item): item for item in items}
+        
+        failed_count = 0
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                provider, model, skill, exit_code = future.result()
+                display = f"{provider}/{model}" + (f"/{skill}" if skill else "")
+                if exit_code == 0:
+                    print(f"✅ [{display}] Completed")
+                else:
+                    print(f"❌ [{display}] Failed (exit code {exit_code})")
+                    failed_count += 1
+            except Exception as e:
+                print(f"❌ Error: {e}")
+                failed_count += 1
+    
+    print(f"\n✓ All evaluation(s) completed ({len(items)-failed_count}/{len(items)} passed)")
+    
+    if failed_count > 0:
+        print(f"⚠ {failed_count} evaluation(s) failed")
+    
+    # Consolidate
+    subprocess.run(["python3", "ci/consolidate_results.py"])
 
 
 def parse_command(comment: str) -> tuple:
@@ -155,8 +239,8 @@ def main():
     modified_skills = detect_changes(args.pr_number)
     os.environ["MODIFIED_SKILLS"] = modified_skills
 
-    # Generate matrix
-    items = generate_matrix(args.filter_provider)
+    # Generate matrix (expand with per-skill jobs if skills detected)
+    items = generate_matrix(args.filter_provider, modified_skills)
 
     # Clean previous results
     results_base = Path("tests/results")
@@ -167,7 +251,7 @@ def main():
 
     # Run evaluations
     if args.parallel:
-        run_parallel(items, args.threshold)
+        run_parallel_local(items, args.threshold)
     else:
         run_sequential(items, args.threshold)
 
diff --git a/ci/run_evaluation.py b/ci/run_evaluation.py