Skip to content

Commit 1a17b07

Browse files
chore: parallel more
1 parent d069134 commit 1a17b07

File tree

4 files changed

+208
-57
lines changed

4 files changed

+208
-57
lines changed

.github/workflows/skill-validation.yml

Lines changed: 94 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -9,21 +9,34 @@ permissions:
99
contents: read
1010

1111
jobs:
12-
# Prepare matrix for parallel evaluation (default)
12+
# Prepare matrix for per-skill parallelization
1313
prepare:
1414
runs-on: ubuntu-latest
1515
if: github.event.issue.pull_request && contains(github.event.comment.body, '/test')
1616
outputs:
1717
matrix: ${{ steps.generate-matrix.outputs.matrix }}
18-
filter: ${{ steps.parse.outputs.filter }}
1918

2019
steps:
21-
- name: Parse test command
22-
id: parse
20+
- uses: actions/checkout@v4
21+
22+
- uses: actions/setup-python@v5
23+
with:
24+
python-version: '3.11'
25+
26+
- name: Detect skills and generate matrix
27+
id: generate-matrix
28+
env:
29+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2330
run: |
31+
python3 -m pip install -q pyyaml
32+
33+
# Detect which skills changed
34+
MODIFIED_SKILLS=$(python3 ci/detect_changes.py ${{ github.event.issue.number }} 2>/dev/null || echo "")
35+
echo "Modified skills: $MODIFIED_SKILLS"
36+
37+
# Extract filter from comment
2438
COMMENT="${{ github.event.comment.body }}"
2539
FILTER="all"
26-
2740
if echo "$COMMENT" | grep -q "/test copilot"; then
2841
FILTER="copilot"
2942
elif echo "$COMMENT" | grep -q "/test ollama"; then
@@ -32,24 +45,42 @@ jobs:
3245
FILTER="gemini"
3346
fi
3447
35-
echo "filter=$FILTER" >> $GITHUB_OUTPUT
36-
37-
- uses: actions/checkout@v4
38-
39-
- uses: actions/setup-python@v5
40-
with:
41-
python-version: '3.11'
42-
43-
- name: Generate matrix
44-
id: generate-matrix
45-
run: |
46-
python3 -m pip install -q pyyaml
47-
MATRIX=$(python3 ci/matrix_generator.py --filter-provider "${{ steps.parse.outputs.filter }}")
48+
# Generate base matrix from config
49+
BASE_MATRIX=$(python3 ci/matrix_generator.py --filter-provider "$FILTER")
50+
51+
# Expand matrix: one job per skill per model
52+
python3 << 'PYTHON_EOF'
53+
import json
54+
import sys
55+
56+
base_matrix = json.loads("""$BASE_MATRIX""")
57+
modified_skills = """$MODIFIED_SKILLS""".strip().split()
58+
59+
matrix = {"include": []}
60+
61+
if modified_skills and modified_skills[0]:
62+
# Per-skill parallelization
63+
for item in base_matrix["include"]:
64+
for skill in modified_skills:
65+
new_item = item.copy()
66+
new_item["skill"] = skill
67+
new_item["display_name"] = f"{item['display_name']} / {skill}"
68+
matrix["include"].append(new_item)
69+
else:
70+
# Test all skills
71+
matrix = base_matrix
72+
73+
print(json.dumps(matrix, indent=2))
74+
with open('/tmp/matrix.json', 'w') as f:
75+
json.dump(matrix, f)
76+
PYTHON_EOF
77+
78+
MATRIX=$(cat /tmp/matrix.json)
4879
echo "matrix<<EOF" >> $GITHUB_OUTPUT
4980
echo "$MATRIX" >> $GITHUB_OUTPUT
5081
echo "EOF" >> $GITHUB_OUTPUT
5182
52-
# Run evaluations in parallel (default: matrix strategy)
83+
# Run evaluations - one per skill per model (parallel)
5384
evaluate:
5485
needs: prepare
5586
runs-on: ubuntu-latest
@@ -72,33 +103,40 @@ jobs:
72103
with:
73104
enable-cache: true
74105

75-
- name: Detect changes
76-
id: changes
77-
env:
78-
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
79-
run: |
80-
python3 ci/detect_changes.py ${{ github.event.issue.number }} > modified_skills.txt 2>/dev/null || echo "" > modified_skills.txt
81-
SKILLS=$(cat modified_skills.txt)
82-
echo "skills=$SKILLS" >> $GITHUB_OUTPUT
83-
84106
- name: Run evaluation for ${{ matrix.display_name }}
85107
env:
86108
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
87109
COPILOT_GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
88110
OLLAMA_API_KEY: ${{ secrets.OLLAMA_API_KEY }}
89-
MODIFIED_SKILLS: ${{ steps.changes.outputs.skills }}
90111
run: |
91-
python3 ci/run_evaluation.py "${{ matrix.provider }}" "${{ matrix.model }}" 50 "${{ matrix.extra_args }}"
112+
# Build command
113+
CMD="uv run --project tests --frozen tests/evaluator.py"
114+
CMD="$CMD --provider ${{ matrix.provider }}"
115+
CMD="$CMD --model '${{ matrix.model }}'"
116+
CMD="$CMD --judge --verbose --report"
117+
118+
if [ -n "${{ matrix.extra_args }}" ]; then
119+
CMD="$CMD ${{ matrix.extra_args }}"
120+
fi
121+
122+
if [ -n "${{ matrix.skill }}" ]; then
123+
CMD="$CMD --skill ${{ matrix.skill }}"
124+
else
125+
CMD="$CMD --all"
126+
fi
127+
128+
echo "Running: $CMD"
129+
eval "$CMD"
92130
93131
- name: Upload results
94132
if: always()
95133
uses: actions/upload-artifact@v4
96134
with:
97-
name: results-${{ matrix.provider }}-${{ matrix.model }}
98-
path: tests/results/${{ matrix.provider }}/${{ matrix.model }}/
135+
name: results-${{ matrix.provider }}-${{ matrix.model }}${{ matrix.skill && format('-{0}', matrix.skill) || '' }}
136+
path: tests/results/
99137
retention-days: 1
100138

101-
# Consolidate results from all parallel runs
139+
# Consolidate and post results
102140
consolidate:
103141
needs: evaluate
104142
runs-on: ubuntu-latest
@@ -112,7 +150,29 @@ jobs:
112150
path: tests/results/
113151

114152
- name: Consolidate results
115-
run: python3 ci/consolidate_results.py || true
153+
run: |
154+
python3 << 'PYTHON_EOF'
155+
import json
156+
from pathlib import Path
157+
158+
results_base = Path("tests/results")
159+
all_passed = True
160+
161+
# Scan all result dirs
162+
for summary_file in results_base.glob("*/summary.json"):
163+
try:
164+
with open(summary_file) as f:
165+
summary = json.load(f)
166+
# Check if evaluation passed
167+
except:
168+
pass
169+
170+
# Generate comment
171+
comment = "## 🎉 Skill Evaluations Complete\n\n"
172+
comment += "Results aggregated from parallel runs.\n"
173+
174+
Path("comment.md").write_text(comment)
175+
PYTHON_EOF
116176
117177
- name: Post results
118178
uses: marocchino/sticky-pull-request-comment@v2

ci/config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
ollama:
55
enabled: true
66
local: false
7-
context: 32000
7+
context: 28000
88
models:
9-
- gpt-oss:20b-cloud
109
- rnj-1:8b-cloud
1110
- devstral-small-2:24b-cloud
11+
# - gpt-oss:20b-cloud
1212

1313
copilot:
1414
enabled: false

ci/orchestrate_evaluations.py

Lines changed: 103 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def validate_environment():
3232

3333
def detect_changes(pr_number: int) -> str:
3434
"""Detect modified skills in PR."""
35-
print("\n==> Detecting changes")
35+
print("\n==> Detecting changes in PR #{}".format(pr_number))
3636

3737
result = subprocess.run(
3838
["python3", "ci/detect_changes.py", str(pr_number)],
@@ -43,15 +43,21 @@ def detect_changes(pr_number: int) -> str:
4343
modified_skills = result.stdout.strip()
4444

4545
if modified_skills:
46-
print(f"✓ Will test skills: {modified_skills}")
46+
skill_list = modified_skills.split()
47+
print(f"✓ Found {len(skill_list)} modified skill(s): {', '.join(skill_list[:3])}" +
48+
(f" +{len(skill_list)-3} more" if len(skill_list) > 3 else ""))
4749
else:
48-
print("✓ Will test all skills")
50+
print("✓ No skills modified - will test all skills")
4951

5052
return modified_skills
5153

5254

53-
def generate_matrix(filter_provider: str = "all") -> list:
54-
"""Generate evaluation matrix from configuration."""
55+
def generate_matrix(filter_provider: str = "all", skills: str = "") -> list:
56+
"""Generate evaluation matrix from configuration with per-skill jobs.
57+
58+
If skills are provided, creates one matrix item per skill per model.
59+
Otherwise creates one item per model (tests all skills).
60+
"""
5561
print("\n==> Generating evaluation matrix")
5662

5763
result = subprocess.run(
@@ -62,12 +68,14 @@ def generate_matrix(filter_provider: str = "all") -> list:
6268

6369
if result.returncode != 0:
6470
print("❌ Error generating matrix")
71+
print(result.stderr)
6572
sys.exit(1)
6673

6774
try:
6875
matrix_data = json.loads(result.stdout)
6976
except json.JSONDecodeError:
7077
print("❌ Error parsing matrix JSON")
78+
print(result.stdout)
7179
sys.exit(1)
7280

7381
items = matrix_data.get("include", [])
@@ -76,9 +84,25 @@ def generate_matrix(filter_provider: str = "all") -> list:
7684
print("❌ Error: No enabled providers in configuration")
7785
sys.exit(1)
7886

79-
print(f"✓ Generated matrix with {len(items)} configurations")
80-
for item in items:
81-
print(f" - {item['display_name']}")
87+
# If skills are specified, expand matrix to one item per skill per model
88+
if skills and skills.strip():
89+
skill_list = skills.strip().split()
90+
expanded_items = []
91+
92+
for item in items:
93+
for skill in skill_list:
94+
expanded_item = item.copy()
95+
expanded_item["skill"] = skill
96+
expanded_item["display_name"] = f"{item['display_name']} / {skill}"
97+
expanded_items.append(expanded_item)
98+
99+
items = expanded_items
100+
print(f"✓ Generated matrix with {len(items)} job(s) ({len(items)//len(skill_list)} model(s) × {len(skill_list)} skill(s))")
101+
else:
102+
print(f"✓ Generated matrix with {len(items)} configuration(s) (all skills per model)")
103+
104+
for i, item in enumerate(items, 1):
105+
print(f" {i}. {item['display_name']}")
82106

83107
return items
84108

@@ -105,15 +129,75 @@ def run_sequential(items: list, threshold: int = 50):
105129
subprocess.run(["python3", "ci/consolidate_results.py"])
106130

107131

108-
def run_parallel(items: list, threshold: int = 50):
109-
"""Run evaluations in parallel (for GitHub Actions matrix strategy)."""
110-
print("\n==> Running evaluations in parallel")
111-
print("(This is for GitHub Actions matrix strategy - not running locally)")
132+
def run_parallel_local(items: list, threshold: int = 50):
133+
"""Run evaluations in parallel locally (one job per skill per model)."""
134+
print(f"\n==> Running {len(items)} evaluation(s) in parallel")
112135

113-
for item in items:
114-
print(f" Will run: {item['display_name']}")
115-
116-
print("\nUse GitHub Actions matrix strategy in workflow for true parallelization")
136+
import concurrent.futures
137+
138+
def run_single_eval(item):
139+
provider = item["provider"]
140+
model = item["model"]
141+
extra_args = item.get("extra_args", "")
142+
skill = item.get("skill")
143+
144+
display = f"{provider}/{model}" + (f"/{skill}" if skill else "")
145+
print(f"[{display}] Starting...")
146+
147+
cmd = [
148+
"uv",
149+
"run",
150+
"--project",
151+
"tests",
152+
"--frozen",
153+
"tests/evaluator.py",
154+
"--provider",
155+
provider,
156+
"--model",
157+
model,
158+
"--threshold",
159+
str(threshold),
160+
"--judge",
161+
"--verbose",
162+
"--report",
163+
]
164+
165+
if extra_args.strip():
166+
cmd.extend(extra_args.split())
167+
168+
if skill:
169+
cmd.extend(["--skill", skill])
170+
else:
171+
cmd.append("--all")
172+
173+
result = subprocess.run(cmd, capture_output=False)
174+
175+
return provider, model, skill, result.returncode
176+
177+
with concurrent.futures.ThreadPoolExecutor(max_workers=min(len(items), 4)) as executor:
178+
futures = {executor.submit(run_single_eval, item): item for item in items}
179+
180+
failed_count = 0
181+
for future in concurrent.futures.as_completed(futures):
182+
try:
183+
provider, model, skill, exit_code = future.result()
184+
display = f"{provider}/{model}" + (f"/{skill}" if skill else "")
185+
if exit_code == 0:
186+
print(f"✅ [{display}] Completed")
187+
else:
188+
print(f"❌ [{display}] Failed (exit code {exit_code})")
189+
failed_count += 1
190+
except Exception as e:
191+
print(f"❌ Error: {e}")
192+
failed_count += 1
193+
194+
print(f"\n✓ All evaluation(s) completed ({len(items)-failed_count}/{len(items)} passed)")
195+
196+
if failed_count > 0:
197+
print(f"⚠ {failed_count} evaluation(s) failed")
198+
199+
# Consolidate
200+
subprocess.run(["python3", "ci/consolidate_results.py"])
117201

118202

119203
def parse_command(comment: str) -> tuple:
@@ -155,8 +239,8 @@ def main():
155239
modified_skills = detect_changes(args.pr_number)
156240
os.environ["MODIFIED_SKILLS"] = modified_skills
157241

158-
# Generate matrix
159-
items = generate_matrix(args.filter_provider)
242+
# Generate matrix (expand with per-skill jobs if skills detected)
243+
items = generate_matrix(args.filter_provider, modified_skills)
160244

161245
# Clean previous results
162246
results_base = Path("tests/results")
@@ -167,7 +251,7 @@ def main():
167251

168252
# Run evaluations
169253
if args.parallel:
170-
run_parallel(items, args.threshold)
254+
run_parallel_local(items, args.threshold)
171255
else:
172256
run_sequential(items, args.threshold)
173257

0 commit comments

Comments
 (0)