Skip to content

Commit 62d65dc

Browse files
JanKrivanek and Copilot authored
[Infra] Check for overfit in the scenario prompt as well (#218)
* Check for overfit in the scenario prompt as well
* Update eng/skill-validator/src/Services/OverfittingJudge.cs

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
* Fix tests

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 982a0bb commit 62d65dc

5 files changed

Lines changed: 458 additions & 9 deletions

File tree

eng/dashboard/generate-benchmark-data.ps1

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -106,15 +106,19 @@ foreach ($verdict in $results.verdicts) {
106106
# When breakdowns exist we use them; when they don't (older schema)
107107
# we fall back to the verdict-level flag for every scenario.
108108
$hasBreakdowns = $verdict.overfittingResult.PSObject.Properties['rubricAssessments'] -or
109-
$verdict.overfittingResult.PSObject.Properties['assertionAssessments']
109+
$verdict.overfittingResult.PSObject.Properties['assertionAssessments'] -or
110+
$verdict.overfittingResult.PSObject.Properties['promptAssessments']
110111

111112
if ($hasBreakdowns) {
112113
$rubrics = $verdict.overfittingResult.rubricAssessments | Where-Object { $_.scenario -eq $scenarioName }
113114
$assertions = $verdict.overfittingResult.assertionAssessments | Where-Object { $_.scenario -eq $scenarioName }
115+
$prompts = $verdict.overfittingResult.promptAssessments | Where-Object { $_.scenario -eq $scenarioName }
114116
# Rubric classifications: outcome | technique | vocabulary — flag non-outcome.
115117
# Assertion classifications: broad | narrow — flag narrow.
118+
# Prompt issues: any prompt assessment for this scenario is a flag.
116119
$scenarioHasIssues = ($rubrics | Where-Object { $_.classification -ne "outcome" }) -or
117-
($assertions | Where-Object { $_.classification -eq "narrow" })
120+
($assertions | Where-Object { $_.classification -eq "narrow" }) -or
121+
($prompts | Measure-Object).Count -gt 0
118122
if ($scenarioHasIssues) {
119123
$overfittingSeverity = $verdict.overfittingResult.severity.ToLower()
120124
$overfittingScore = $verdict.overfittingResult.score

eng/skill-validator/src/Models/Models.cs

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -251,11 +251,18 @@ public sealed record AssertionOverfitAssessment(
251251
double Confidence,
252252
string Reasoning);
253253

254+
/// <summary>
/// One prompt-level overfitting finding for a single eval scenario.
/// </summary>
/// <param name="Scenario">Name of the scenario whose prompt was flagged.</param>
/// <param name="Issue">Kind of prompt bias that was detected (see the example values below).</param>
/// <param name="Confidence">Confidence in the finding; expected to be in [0, 1] but not validated by this record.</param>
/// <param name="Reasoning">Human-readable explanation of why the prompt was flagged.</param>
public sealed record PromptOverfitAssessment(
255+
string Scenario,
256+
string Issue, // e.g. "explicit_skill_reference" | "skill_instruction"
257+
double Confidence,
258+
string Reasoning);
259+
254260
public sealed record OverfittingResult(
255261
double Score, // [0, 1]
256262
OverfittingSeverity Severity,
257263
IReadOnlyList<RubricOverfitAssessment> RubricAssessments,
258264
IReadOnlyList<AssertionOverfitAssessment> AssertionAssessments,
265+
IReadOnlyList<PromptOverfitAssessment> PromptAssessments,
259266
IReadOnlyList<string> CrossScenarioIssues,
260267
string OverallReasoning);
261268

eng/skill-validator/src/Services/OverfittingJudge.cs

Lines changed: 156 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,12 @@
11
using System.Text.Json;
2+
using System.Text.RegularExpressions;
23
using SkillValidator.Models;
34
using SkillValidator.Utilities;
45
using GitHub.Copilot.SDK;
56

67
namespace SkillValidator.Services;
78

8-
public static class OverfittingJudge
9+
public static partial class OverfittingJudge
910
{
1011
private const int MaxRetries = 2;
1112
private const int MaxSkillContentChars = 48_000; // ~12K tokens
@@ -38,6 +39,10 @@ public static class OverfittingJudge
3839

3940
private static async Task<OverfittingResult> AnalyzeOnce(SkillInfo skill, OverfittingJudgeOptions options)
4041
{
42+
// Run deterministic prompt checks first — these are high-confidence signals
43+
// that don't need LLM judgment.
44+
var deterministicPromptAssessments = DetectPromptOverfitting(skill);
45+
4146
var client = await AgentRunner.GetSharedClient(options.Verbose);
4247

4348
await using var session = await client.CreateSessionAsync(new SessionConfig
@@ -91,7 +96,7 @@ private static async Task<OverfittingResult> AnalyzeOnce(SkillInfo skill, Overfi
9196
var content = await done.Task.WaitAsync(cts.Token);
9297

9398
if (!string.IsNullOrEmpty(content))
94-
return ParseOverfittingResponse(content);
99+
return ParseOverfittingResponse(content, deterministicPromptAssessments);
95100

96101
throw new InvalidOperationException("Overfitting judge returned no content");
97102
}
@@ -154,7 +159,7 @@ public static async Task GenerateFix(SkillInfo skill, OverfittingResult result,
154159
}
155160
}
156161

157-
internal static OverfittingResult ParseOverfittingResponse(string content)
162+
internal static OverfittingResult ParseOverfittingResponse(string content, IReadOnlyList<PromptOverfitAssessment>? deterministicPromptAssessments = null)
158163
{
159164
var jsonStr = LlmJson.ExtractJson(content)
160165
?? throw new InvalidOperationException(
@@ -199,6 +204,37 @@ internal static OverfittingResult ParseOverfittingResponse(string content)
199204
}
200205
}
201206

207+
// Parse LLM prompt assessments
208+
var llmPromptAssessments = new List<PromptOverfitAssessment>();
209+
if (parsed.TryGetProperty("prompt_assessments", out var promptEl) && promptEl.ValueKind == JsonValueKind.Array)
210+
{
211+
foreach (var item in promptEl.EnumerateArray())
212+
{
213+
var scenario = item.TryGetProperty("scenario", out var s) ? s.GetString() ?? "" : "";
214+
var issue = item.TryGetProperty("issue", out var i) ? i.GetString() ?? "" : "";
215+
var confidence = item.TryGetProperty("confidence", out var conf) ? conf.GetDouble() : 0.5;
216+
var reasoning = item.TryGetProperty("reasoning", out var r) ? r.GetString() ?? "" : "";
217+
llmPromptAssessments.Add(new PromptOverfitAssessment(scenario, issue, confidence, reasoning));
218+
}
219+
}
220+
221+
// Merge deterministic prompt assessments (high priority) with LLM-detected ones.
222+
// Deterministic detections are authoritative — they have confidence 1.0.
223+
// Only add LLM detections for (scenario, issue) pairs not already covered by deterministic checks.
224+
var promptAssessments = new List<PromptOverfitAssessment>(deterministicPromptAssessments ?? []);
225+
var coveredScenarioIssues = new HashSet<string>(
226+
promptAssessments.Select(p => $"{p.Scenario}\u0001{p.Issue}".ToLowerInvariant())
227+
);
228+
foreach (var llmAssessment in llmPromptAssessments)
229+
{
230+
var key = $"{llmAssessment.Scenario}\u0001{llmAssessment.Issue}".ToLowerInvariant();
231+
if (!coveredScenarioIssues.Contains(key))
232+
{
233+
promptAssessments.Add(llmAssessment);
234+
coveredScenarioIssues.Add(key);
235+
}
236+
}
237+
202238
double llmOverallScore = 0.0;
203239
if (parsed.TryGetProperty("overall_overfitting_score", out var overallEl))
204240
llmOverallScore = Math.Clamp(overallEl.GetDouble(), 0.0, 1.0);
@@ -208,7 +244,7 @@ internal static OverfittingResult ParseOverfittingResponse(string content)
208244
overallReasoning = reasonEl.GetString() ?? "";
209245

210246
// Compute score from per-element classifications
211-
double computedScore = ComputeOverfittingScore(rubricAssessments, assertionAssessments);
247+
double computedScore = ComputeOverfittingScore(rubricAssessments, assertionAssessments, promptAssessments);
212248

213249
// Blend: 60% computed (systematic) + 40% LLM holistic
214250
double finalScore = Math.Clamp(0.6 * computedScore + 0.4 * llmOverallScore, 0.0, 1.0);
@@ -225,13 +261,15 @@ internal static OverfittingResult ParseOverfittingResponse(string content)
225261
severity,
226262
rubricAssessments,
227263
assertionAssessments,
264+
promptAssessments,
228265
crossScenarioIssues,
229266
overallReasoning);
230267
}
231268

232269
internal static double ComputeOverfittingScore(
233270
IReadOnlyList<RubricOverfitAssessment> rubricAssessments,
234-
IReadOnlyList<AssertionOverfitAssessment> assertionAssessments)
271+
IReadOnlyList<AssertionOverfitAssessment> assertionAssessments,
272+
IReadOnlyList<PromptOverfitAssessment>? promptAssessments = null)
235273
{
236274
// Rubric scoring: weight by classification and confidence
237275
double rubricScore = 0;
@@ -264,13 +302,80 @@ internal static double ComputeOverfittingScore(
264302
assertionCount++;
265303
}
266304

267-
// Weighted combination (rubric matters more — assertions are secondary gates)
305+
// Prompt scoring — explicit skill references in prompts are a severe signal.
306+
// Each prompt issue is scored at full weight (1.0) * confidence.
307+
double promptScore = 0;
308+
int promptCount = promptAssessments?.Count ?? 0;
309+
if (promptAssessments is not null)
310+
{
311+
foreach (var item in promptAssessments)
312+
{
313+
promptScore += 1.0 * item.Confidence;
314+
}
315+
}
316+
268317
double rubricAvg = rubricCount > 0 ? rubricScore / rubricCount : 0;
269318
double assertionAvg = assertionCount > 0 ? assertionScore / assertionCount : 0;
319+
double promptAvg = promptCount > 0 ? promptScore / promptCount : 0;
270320

321+
if (promptCount > 0)
322+
{
323+
// When prompt issues exist, they dominate the score because explicit skill
324+
// references in the prompt are the strongest form of overfitting — they
325+
// directly bias which agent gets advantage.
326+
// Weight: 40% prompt, 40% rubric, 20% assertion
327+
return Math.Clamp(0.4 * promptAvg + 0.4 * rubricAvg + 0.2 * assertionAvg, 0.0, 1.0);
328+
}
329+
330+
// Original weighting when no prompt issues (rubric matters more — assertions are secondary gates)
271331
return Math.Clamp(0.7 * rubricAvg + 0.3 * assertionAvg, 0.0, 1.0);
272332
}
273333

334+
/// <summary>
335+
/// Deterministic pre-check: scans scenario prompts for explicit skill references
336+
/// that bias the evaluation. These patterns are unambiguously overfitted — they
337+
/// give the skilled agent a direct advantage by name-dropping the skill.
338+
/// </summary>
/// <param name="skill">Skill whose eval scenarios are scanned.</param>
/// <returns>
/// At most one assessment per scenario: "explicit_skill_reference" (confidence 1.0) when the
/// prompt contains the skill name, otherwise "skill_instruction" (confidence 0.9) when the
/// prompt matches the "use ... skill" phrasing. Empty when the skill has no eval config or a
/// blank name.
/// </returns>
339+
internal static IReadOnlyList<PromptOverfitAssessment> DetectPromptOverfitting(SkillInfo skill)
340+
{
341+
var assessments = new List<PromptOverfitAssessment>();
342+
// No eval config or no usable skill name: return the empty list.
if (skill.EvalConfig is null || string.IsNullOrWhiteSpace(skill.Name))
343+
return assessments;
344+
345+
foreach (var scenario in skill.EvalConfig.Scenarios)
346+
{
347+
var prompt = scenario.Prompt;
348+
// Scenarios without prompt text cannot be checked; skip them.
if (string.IsNullOrWhiteSpace(prompt)) continue;
349+
350+
// Check 1: Prompt explicitly contains the skill name (e.g., "migrate-dotnet10-to-dotnet11")
// Case-insensitive substring match, so the name is also found inside longer text.
351+
if (prompt.Contains(skill.Name, StringComparison.OrdinalIgnoreCase))
352+
{
353+
assessments.Add(new PromptOverfitAssessment(
354+
scenario.Name,
355+
"explicit_skill_reference",
356+
1.0,
357+
$"Prompt explicitly mentions skill name '{skill.Name}' — this directly tells the skilled agent which skill to activate and disadvantages the baseline agent."));
358+
continue; // One assessment per scenario is enough
359+
}
360+
361+
// Check 2: Prompt uses "use the ... skill" or "use ... skill to" phrasing
362+
// even if the exact skill name isn't used (e.g., "use the migration skill")
// Only reached when Check 1 did not match; reported with lower confidence (0.9 vs 1.0).
363+
if (UseSkillPattern().IsMatch(prompt))
364+
{
365+
assessments.Add(new PromptOverfitAssessment(
366+
scenario.Name,
367+
"skill_instruction",
368+
0.9,
369+
"Prompt explicitly instructs the agent to 'use' a skill — this biases toward the skilled agent and creates an unfair comparison with the baseline."));
370+
}
371+
}
372+
373+
return assessments;
374+
}
375+
376+
/// <summary>
/// Source-generated, case-insensitive pattern for phrasing such as "use the migration skill":
/// the word "use", an optional "the", exactly one token of word characters and hyphens, then
/// the word "skill". Phrases with two or more words between "use"/"the" and "skill" do not match.
/// </summary>
[GeneratedRegex(@"\buse\s+(?:the\s+)?[\w-]+\s+skill\b", RegexOptions.IgnoreCase)]
377+
private static partial Regex UseSkillPattern();
378+
274379
internal static string BuildSystemPrompt() =>
275380
"""
276381
You are an expert evaluator assessing whether an AI skill's evaluation
@@ -434,6 +539,41 @@ to find the leak is solving the same problem differently.
434539
"Identified evidence of a memory leak and proposed a diagnosis path"
435540
→ Tests whether the agent found the issue, not which metric it checked.
436541
542+
### Scenario prompt classifications
543+
544+
ALSO assess each scenario's PROMPT for bias that unfairly advantages the
545+
skilled agent. This is a CRITICAL and often-overlooked form of overfitting:
546+
547+
- "explicit_skill_reference" — The prompt mentions the skill by name
548+
(e.g., "Use the migrate-dotnet10-to-dotnet11 skill"). This directly
549+
tells the skilled agent which skill to activate and creates an unfair
550+
disadvantage for the baseline agent, which cannot follow this instruction.
551+
552+
- "skill_instruction" — The prompt instructs the agent to "use a skill"
553+
or references skill-specific concepts that only make sense if the skill
554+
is loaded (e.g., "use the migration skill to help me").
555+
556+
- "neutral" — The prompt describes the task naturally without referencing
557+
skills. A developer might write this prompt regardless of whether a
558+
skill exists.
559+
560+
If the prompt is "neutral", do NOT include it in prompt_assessments.
561+
562+
#### Example 8: HIGH overfitting — prompt explicitly names the skill
563+
564+
SKILL name: migrate-dotnet10-to-dotnet11
565+
566+
OVERFITTED prompt — HIGH:
567+
"Use the migrate-dotnet10-to-dotnet11 skill to help me migrate my
568+
.NET 10 console app to .NET 11."
569+
→ Tells the skilled agent exactly which skill to activate. The baseline
570+
agent wastes time looking for a skill it doesn't have. This is the
571+
strongest form of overfitting.
572+
573+
WELL-DESIGNED prompt:
574+
"I need to migrate my .NET 10 console app to .NET 11. What breaks?"
575+
→ Describes the task naturally. Both agents get the same fair prompt.
576+
437577
Respond ONLY with JSON. No markdown, no commentary outside the JSON.
438578
""";
439579

@@ -470,7 +610,8 @@ internal static async Task<string> BuildUserPromptAsync(SkillInfo skill)
470610
=== CLASSIFICATION REQUEST ===
471611
472612
Classify every rubric item and every assertion across all scenarios.
473-
Then provide an overall overfitting score from 0.0 to 1.0.
613+
Also assess each scenario prompt for bias (skill name references, skill
614+
instructions). Then provide an overall overfitting score from 0.0 to 1.0.
474615
475616
Respond in this exact JSON schema:
476617
@@ -493,6 +634,14 @@ Then provide an overall overfitting score from 0.0 to 1.0.
493634
"reasoning": "<1-2 sentence explanation>"
494635
}
495636
],
637+
"prompt_assessments": [
638+
{
639+
"scenario": "<scenario name>",
640+
"issue": "explicit_skill_reference" | "skill_instruction",
641+
"confidence": <0.0-1.0>,
642+
"reasoning": "<1-2 sentence explanation>"
643+
}
644+
],
496645
"cross_scenario_issues": [
497646
"<description of any cross-scenario overfitting patterns>"
498647
],

eng/skill-validator/src/Services/Reporter.cs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,10 @@ private static void ReportConsole(IReadOnlyList<SkillVerdict> verdicts, bool ver
113113
// For moderate/high, show top signals
114114
if (overfitResult.Severity is OverfittingSeverity.Moderate or OverfittingSeverity.High)
115115
{
116+
// Show prompt-level issues first (most severe)
117+
foreach (var item in overfitResult.PromptAssessments)
118+
Console.WriteLine($" \x1b[2m•\x1b[0m [{item.Issue}] \x1b[2mscenario \"{item.Scenario}\"\x1b[0m\n \x1b[2m— {item.Reasoning}\x1b[0m");
119+
116120
var topRubric = overfitResult.RubricAssessments
117121
.Where(a => a.Classification != "outcome")
118122
.OrderByDescending(a => a.Confidence)

0 commit comments

Comments
 (0)