 using System.Text.Json;
+using System.Text.RegularExpressions;
 using SkillValidator.Models;
 using SkillValidator.Utilities;
 using GitHub.Copilot.SDK;
 
 namespace SkillValidator.Services;
 
-public static class OverfittingJudge
+public static partial class OverfittingJudge
 {
     private const int MaxRetries = 2;
     private const int MaxSkillContentChars = 48_000; // ~12K tokens
@@ -38,6 +39,10 @@ public static class OverfittingJudge
 
     private static async Task<OverfittingResult> AnalyzeOnce(SkillInfo skill, OverfittingJudgeOptions options)
     {
+        // Run deterministic prompt checks first — these are high-confidence signals
+        // that don't need LLM judgment.
+        var deterministicPromptAssessments = DetectPromptOverfitting(skill);
+
         var client = await AgentRunner.GetSharedClient(options.Verbose);
 
         await using var session = await client.CreateSessionAsync(new SessionConfig
@@ -91,7 +96,7 @@ private static async Task<OverfittingResult> AnalyzeOnce(SkillInfo skill, Overfi
         var content = await done.Task.WaitAsync(cts.Token);
 
         if (!string.IsNullOrEmpty(content))
-            return ParseOverfittingResponse(content);
+            return ParseOverfittingResponse(content, deterministicPromptAssessments);
 
         throw new InvalidOperationException("Overfitting judge returned no content");
     }
@@ -154,7 +159,7 @@ public static async Task GenerateFix(SkillInfo skill, OverfittingResult result,
         }
     }
 
-    internal static OverfittingResult ParseOverfittingResponse(string content)
+    internal static OverfittingResult ParseOverfittingResponse(string content, IReadOnlyList<PromptOverfitAssessment>? deterministicPromptAssessments = null)
     {
         var jsonStr = LlmJson.ExtractJson(content)
             ?? throw new InvalidOperationException(
@@ -199,6 +204,37 @@ internal static OverfittingResult ParseOverfittingResponse(string content)
             }
         }
 
+        // Parse LLM prompt assessments
+        var llmPromptAssessments = new List<PromptOverfitAssessment>();
+        if (parsed.TryGetProperty("prompt_assessments", out var promptEl) && promptEl.ValueKind == JsonValueKind.Array)
+        {
+            foreach (var item in promptEl.EnumerateArray())
+            {
+                var scenario = item.TryGetProperty("scenario", out var s) ? s.GetString() ?? "" : "";
+                var issue = item.TryGetProperty("issue", out var i) ? i.GetString() ?? "" : "";
+                var confidence = item.TryGetProperty("confidence", out var conf) ? conf.GetDouble() : 0.5;
+                var reasoning = item.TryGetProperty("reasoning", out var r) ? r.GetString() ?? "" : "";
+                llmPromptAssessments.Add(new PromptOverfitAssessment(scenario, issue, confidence, reasoning));
+            }
+        }
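+        // Illustrative item shape, assumed from the prompt_assessments schema later in
+        // this file (field values here are made up):
+        //   { "scenario": "<scenario name>", "issue": "skill_instruction",
+        //     "confidence": 0.9, "reasoning": "..." }
+        // Missing fields fall back to "" / 0.5 above instead of throwing.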
+
+        // Merge deterministic prompt assessments (high priority) with LLM-detected ones.
+        // Deterministic detections are authoritative — they have confidence 1.0.
+        // Only add LLM detections for (scenario, issue) pairs not already covered by deterministic checks.
+        var promptAssessments = new List<PromptOverfitAssessment>(deterministicPromptAssessments ?? []);
+        var coveredScenarioIssues = new HashSet<string>(
+            promptAssessments.Select(p => $"{p.Scenario}\u0001{p.Issue}".ToLowerInvariant())
+        );
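+        // Note: \u0001 is a control character that should never appear in scenario or
+        // issue text, so concatenated keys like ("ab","c") and ("a","bc") stay distinct.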
+        foreach (var llmAssessment in llmPromptAssessments)
+        {
+            var key = $"{llmAssessment.Scenario}\u0001{llmAssessment.Issue}".ToLowerInvariant();
+            if (!coveredScenarioIssues.Contains(key))
+            {
+                promptAssessments.Add(llmAssessment);
+                coveredScenarioIssues.Add(key);
+            }
+        }
+
         double llmOverallScore = 0.0;
         if (parsed.TryGetProperty("overall_overfitting_score", out var overallEl))
             llmOverallScore = Math.Clamp(overallEl.GetDouble(), 0.0, 1.0);
@@ -208,7 +244,7 @@ internal static OverfittingResult ParseOverfittingResponse(string content)
             overallReasoning = reasonEl.GetString() ?? "";
 
         // Compute score from per-element classifications
-        double computedScore = ComputeOverfittingScore(rubricAssessments, assertionAssessments);
+        double computedScore = ComputeOverfittingScore(rubricAssessments, assertionAssessments, promptAssessments);
 
         // Blend: 60% computed (systematic) + 40% LLM holistic
         double finalScore = Math.Clamp(0.6 * computedScore + 0.4 * llmOverallScore, 0.0, 1.0);
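+        // Worked example (values illustrative): computedScore 0.50 and llmOverallScore
+        // 0.80 blend to 0.6*0.50 + 0.4*0.80 = 0.62.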
@@ -225,13 +261,15 @@ internal static OverfittingResult ParseOverfittingResponse(string content)
             severity,
             rubricAssessments,
             assertionAssessments,
+            promptAssessments,
             crossScenarioIssues,
             overallReasoning);
     }
 
     internal static double ComputeOverfittingScore(
         IReadOnlyList<RubricOverfitAssessment> rubricAssessments,
-        IReadOnlyList<AssertionOverfitAssessment> assertionAssessments)
+        IReadOnlyList<AssertionOverfitAssessment> assertionAssessments,
+        IReadOnlyList<PromptOverfitAssessment>? promptAssessments = null)
     {
         // Rubric scoring: weight by classification and confidence
         double rubricScore = 0;
@@ -264,13 +302,80 @@ internal static double ComputeOverfittingScore(
             assertionCount++;
         }
 
-        // Weighted combination (rubric matters more — assertions are secondary gates)
+        // Prompt scoring — explicit skill references in prompts are a severe signal.
+        // Each prompt issue is scored at full weight (1.0) * confidence.
+        double promptScore = 0;
+        int promptCount = promptAssessments?.Count ?? 0;
+        if (promptAssessments is not null)
+        {
+            foreach (var item in promptAssessments)
+            {
+                promptScore += 1.0 * item.Confidence;
+            }
+        }
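+        // e.g., two prompt issues at confidence 1.0 and 0.9 accumulate promptScore 1.9,
+        // which averages to promptAvg 0.95 below.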
+
         double rubricAvg = rubricCount > 0 ? rubricScore / rubricCount : 0;
         double assertionAvg = assertionCount > 0 ? assertionScore / assertionCount : 0;
+        double promptAvg = promptCount > 0 ? promptScore / promptCount : 0;
 
+        if (promptCount > 0)
+        {
+            // When prompt issues exist, they dominate the score because explicit skill
+            // references in the prompt are the strongest form of overfitting — they
+            // directly bias which agent has the advantage.
+            // Weight: 40% prompt, 40% rubric, 20% assertion
+            return Math.Clamp(0.4 * promptAvg + 0.4 * rubricAvg + 0.2 * assertionAvg, 0.0, 1.0);
+        }
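+        // Worked example (values illustrative): one prompt hit at confidence 1.0 with
+        // rubricAvg 0.30 and assertionAvg 0.00 yields 0.4*1.0 + 0.4*0.30 + 0.2*0.00
+        // = 0.52, versus 0.21 from the 0.7/0.3 weighting below.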
+
+        // Original weighting when no prompt issues (rubric matters more — assertions are secondary gates)
         return Math.Clamp(0.7 * rubricAvg + 0.3 * assertionAvg, 0.0, 1.0);
     }
 
+    /// <summary>
+    /// Deterministic pre-check: scans scenario prompts for explicit skill references
+    /// that bias the evaluation. These patterns are unambiguously overfitted — they
+    /// give the skilled agent a direct advantage by name-dropping the skill.
+    /// </summary>
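+    /// <example>
+    /// For a hypothetical skill named "csv-cleanup" whose scenario prompt reads
+    /// "Use the csv-cleanup skill to tidy this file", this returns a single
+    /// "explicit_skill_reference" assessment at confidence 1.0.
+    /// </example>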
+    internal static IReadOnlyList<PromptOverfitAssessment> DetectPromptOverfitting(SkillInfo skill)
+    {
+        var assessments = new List<PromptOverfitAssessment>();
+        if (skill.EvalConfig is null || string.IsNullOrWhiteSpace(skill.Name))
+            return assessments;
+
+        foreach (var scenario in skill.EvalConfig.Scenarios)
+        {
+            var prompt = scenario.Prompt;
+            if (string.IsNullOrWhiteSpace(prompt)) continue;
+
+            // Check 1: Prompt explicitly contains the skill name (e.g., "migrate-dotnet10-to-dotnet11")
+            if (prompt.Contains(skill.Name, StringComparison.OrdinalIgnoreCase))
+            {
+                assessments.Add(new PromptOverfitAssessment(
+                    scenario.Name,
+                    "explicit_skill_reference",
+                    1.0,
+                    $"Prompt explicitly mentions skill name '{skill.Name}' — this directly tells the skilled agent which skill to activate and disadvantages the baseline agent."));
+                continue; // One assessment per scenario is enough
+            }
+
+            // Check 2: Prompt uses "use the ... skill" or "use ... skill to" phrasing
+            // even if the exact skill name isn't used (e.g., "use the migration skill")
+            if (UseSkillPattern().IsMatch(prompt))
+            {
+                assessments.Add(new PromptOverfitAssessment(
+                    scenario.Name,
+                    "skill_instruction",
+                    0.9,
+                    "Prompt explicitly instructs the agent to 'use' a skill — this biases toward the skilled agent and creates an unfair comparison with the baseline."));
+            }
+        }
+
+        return assessments;
+    }
+
+    [GeneratedRegex(@"\buse\s+(?:the\s+)?[\w-]+\s+skill\b", RegexOptions.IgnoreCase)]
+    private static partial Regex UseSkillPattern();
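+    // Sketch of what the pattern does and doesn't catch (examples are illustrative):
+    //   matches:  "use the migration skill", "Use my-cool skill to fix this"
+    //   no match: "use your skills wisely" ("skills" fails the \b after "skill"),
+    //             "apply the skill" (no leading "use")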
+
     internal static string BuildSystemPrompt() =>
         """
         You are an expert evaluator assessing whether an AI skill's evaluation
@@ -434,6 +539,41 @@ to find the leak is solving the same problem differently.
434539 "Identified evidence of a memory leak and proposed a diagnosis path"
435540 → Tests whether the agent found the issue, not which metric it checked.
436541
+        ### Scenario prompt classifications
+
+        ALSO assess each scenario's PROMPT for bias that unfairly advantages the
+        skilled agent. This is a CRITICAL and often-overlooked form of overfitting:
+
+        - "explicit_skill_reference" — The prompt mentions the skill by name
+          (e.g., "Use the migrate-dotnet10-to-dotnet11 skill"). This directly
+          tells the skilled agent which skill to activate and creates an unfair
+          disadvantage for the baseline agent, which cannot follow this instruction.
+
+        - "skill_instruction" — The prompt instructs the agent to "use a skill"
+          or references skill-specific concepts that only make sense if the skill
+          is loaded (e.g., "use the migration skill to help me").
+
+        - "neutral" — The prompt describes the task naturally without referencing
+          skills. A developer might write this prompt regardless of whether a
+          skill exists.
+
+        If the prompt is "neutral", do NOT include it in prompt_assessments.
+
+        #### Example 8: HIGH overfitting — prompt explicitly names the skill
+
+        SKILL name: migrate-dotnet10-to-dotnet11
+
+        OVERFITTED prompt — HIGH:
+        "Use the migrate-dotnet10-to-dotnet11 skill to help me migrate my
+        .NET 10 console app to .NET 11."
+        → Tells the skilled agent exactly which skill to activate. The baseline
+          agent wastes time looking for a skill it doesn't have. This is the
+          strongest form of overfitting.
+
+        WELL-DESIGNED prompt:
+        "I need to migrate my .NET 10 console app to .NET 11. What breaks?"
+        → Describes the task naturally. Both agents get the same fair prompt.
+
         Respond ONLY with JSON. No markdown, no commentary outside the JSON.
         """;
 
@@ -470,7 +610,8 @@ internal static async Task<string> BuildUserPromptAsync(SkillInfo skill)
         === CLASSIFICATION REQUEST ===
 
         Classify every rubric item and every assertion across all scenarios.
-        Then provide an overall overfitting score from 0.0 to 1.0.
+        Also assess each scenario prompt for bias (skill name references, skill
+        instructions). Then provide an overall overfitting score from 0.0 to 1.0.
 
         Respond in this exact JSON schema:
 
@@ -493,6 +634,14 @@ Then provide an overall overfitting score from 0.0 to 1.0.
493634 "reasoning": "<1-2 sentence explanation>"
494635 }
495636 ],
637+ "prompt_assessments": [
638+ {
639+ "scenario": "<scenario name>",
640+ "issue": "explicit_skill_reference" | "skill_instruction",
641+ "confidence": <0.0-1.0>,
642+ "reasoning": "<1-2 sentence explanation>"
643+ }
644+ ],
496645 "cross_scenario_issues": [
497646 "<description of any cross-scenario overfitting patterns>"
498647 ],