@@ -32,7 +32,7 @@ def validate_environment():
3232
3333def detect_changes (pr_number : int ) -> str :
3434 """Detect modified skills in PR."""
35- print ("\n ==> Detecting changes" )
35+ print ("\n ==> Detecting changes in PR #{}" . format ( pr_number ) )
3636
3737 result = subprocess .run (
3838 ["python3" , "ci/detect_changes.py" , str (pr_number )],
@@ -43,15 +43,21 @@ def detect_changes(pr_number: int) -> str:
4343 modified_skills = result .stdout .strip ()
4444
4545 if modified_skills :
46- print (f"✓ Will test skills: { modified_skills } " )
46+ skill_list = modified_skills .split ()
47+ print (f"✓ Found { len (skill_list )} modified skill(s): { ', ' .join (skill_list [:3 ])} " +
48+ (f" +{ len (skill_list )- 3 } more" if len (skill_list ) > 3 else "" ))
4749 else :
48- print ("✓ Will test all skills" )
50+ print ("✓ No skills modified - will test all skills" )
4951
5052 return modified_skills
5153
5254
53- def generate_matrix (filter_provider : str = "all" ) -> list :
54- """Generate evaluation matrix from configuration."""
55+ def generate_matrix (filter_provider : str = "all" , skills : str = "" ) -> list :
56+ """Generate evaluation matrix from configuration with per-skill jobs.
57+
58+ If skills are provided, creates one matrix item per skill per model.
59+ Otherwise creates one item per model (tests all skills).
60+ """
5561 print ("\n ==> Generating evaluation matrix" )
5662
5763 result = subprocess .run (
@@ -62,12 +68,14 @@ def generate_matrix(filter_provider: str = "all") -> list:
6268
6369 if result .returncode != 0 :
6470 print ("❌ Error generating matrix" )
71+ print (result .stderr )
6572 sys .exit (1 )
6673
6774 try :
6875 matrix_data = json .loads (result .stdout )
6976 except json .JSONDecodeError :
7077 print ("❌ Error parsing matrix JSON" )
78+ print (result .stdout )
7179 sys .exit (1 )
7280
7381 items = matrix_data .get ("include" , [])
@@ -76,9 +84,25 @@ def generate_matrix(filter_provider: str = "all") -> list:
7684 print ("❌ Error: No enabled providers in configuration" )
7785 sys .exit (1 )
7886
79- print (f"✓ Generated matrix with { len (items )} configurations" )
80- for item in items :
81- print (f" - { item ['display_name' ]} " )
87+ # If skills are specified, expand matrix to one item per skill per model
88+ if skills and skills .strip ():
89+ skill_list = skills .strip ().split ()
90+ expanded_items = []
91+
92+ for item in items :
93+ for skill in skill_list :
94+ expanded_item = item .copy ()
95+ expanded_item ["skill" ] = skill
96+ expanded_item ["display_name" ] = f"{ item ['display_name' ]} / { skill } "
97+ expanded_items .append (expanded_item )
98+
99+ items = expanded_items
100+ print (f"✓ Generated matrix with { len (items )} job(s) ({ len (items )// len (skill_list )} model(s) × { len (skill_list )} skill(s))" )
101+ else :
102+ print (f"✓ Generated matrix with { len (items )} configuration(s) (all skills per model)" )
103+
104+ for i , item in enumerate (items , 1 ):
105+ print (f" { i } . { item ['display_name' ]} " )
82106
83107 return items
84108
@@ -105,15 +129,75 @@ def run_sequential(items: list, threshold: int = 50):
105129 subprocess .run (["python3" , "ci/consolidate_results.py" ])
106130
107131
def run_parallel_local(items: list, threshold: int = 50):
    """Run evaluations concurrently on the local machine.

    Every matrix item is dispatched to a worker thread that launches
    ``tests/evaluator.py`` through ``uv run``. An item that carries a
    ``"skill"`` key is evaluated with ``--skill <name>``; an item without one
    is evaluated with ``--all``. Once every job has finished, a pass/fail
    summary is printed and ``ci/consolidate_results.py`` is invoked.

    Args:
        items: Matrix entries (dicts). Each must provide ``"provider"`` and
            ``"model"``; ``"extra_args"`` (a command-line string) and
            ``"skill"`` are optional.
        threshold: Value forwarded to the evaluator's ``--threshold`` flag.

    Returns:
        None. Failures are reported on stdout only.
    """
    # Local imports: only this code path needs these modules.
    import concurrent.futures
    import shlex

    print(f"\n ==> Running {len(items)} evaluation(s) in parallel")

    # ThreadPoolExecutor rejects max_workers=0, so an empty matrix has to be
    # handled before the pool is created.
    if not items:
        print("⚠ No evaluations to run")
        return

    def describe(provider, model, skill):
        """Build the provider/model[/skill] label used in log lines."""
        label = f"{provider}/{model}"
        return f"{label}/{skill}" if skill else label

    def run_single_eval(item):
        """Launch one evaluator process and return its identity and exit code."""
        provider = item["provider"]
        model = item["model"]
        # `or ""` also covers an explicit null in the matrix JSON, which
        # .get()'s default would not.
        extra_args = item.get("extra_args") or ""
        skill = item.get("skill")

        print(f"[{describe(provider, model, skill)}] Starting...")

        cmd = [
            "uv",
            "run",
            "--project",
            "tests",
            "--frozen",
            "tests/evaluator.py",
            "--provider",
            provider,
            "--model",
            model,
            "--threshold",
            str(threshold),
            "--judge",
            "--verbose",
            "--report",
        ]

        # shlex keeps quoted arguments together, unlike a plain str.split().
        cmd.extend(shlex.split(extra_args))

        if skill:
            cmd.extend(["--skill", skill])
        else:
            cmd.append("--all")

        # Child output is streamed straight to the console, not captured.
        result = subprocess.run(cmd, capture_output=False)

        return provider, model, skill, result.returncode

    total = len(items)
    failed_count = 0

    # At most four jobs run at once.
    with concurrent.futures.ThreadPoolExecutor(max_workers=min(total, 4)) as executor:
        futures = {executor.submit(run_single_eval, item): item for item in items}

        for future in concurrent.futures.as_completed(futures):
            try:
                provider, model, skill, exit_code = future.result()
            except Exception as e:
                # Best effort: one broken job must not abort the others, but
                # the report should say which matrix entry it was.
                failed_item = futures[future]
                name = (
                    failed_item.get("display_name", failed_item)
                    if isinstance(failed_item, dict)
                    else failed_item
                )
                print(f"❌ [{name}] Error: {e}")
                failed_count += 1
                continue

            display = describe(provider, model, skill)
            if exit_code == 0:
                print(f"✅ [{display}] Completed")
            else:
                print(f"❌ [{display}] Failed (exit code {exit_code})")
                failed_count += 1

    print(f"\n ✓ All evaluation(s) completed ({total - failed_count}/{total} passed)")

    if failed_count > 0:
        print(f"⚠ {failed_count} evaluation(s) failed")

    # Consolidate
    subprocess.run(["python3", "ci/consolidate_results.py"])
117201
118202
119203def parse_command (comment : str ) -> tuple :
@@ -155,8 +239,8 @@ def main():
155239 modified_skills = detect_changes (args .pr_number )
156240 os .environ ["MODIFIED_SKILLS" ] = modified_skills
157241
158- # Generate matrix
159- items = generate_matrix (args .filter_provider )
242+ # Generate matrix (expand with per-skill jobs if skills detected)
243+ items = generate_matrix (args .filter_provider , modified_skills )
160244
161245 # Clean previous results
162246 results_base = Path ("tests/results" )
@@ -167,7 +251,7 @@ def main():
167251
168252 # Run evaluations
169253 if args .parallel :
170- run_parallel (items , args .threshold )
254+ run_parallel_local (items , args .threshold )
171255 else :
172256 run_sequential (items , args .threshold )
173257
0 commit comments