Skip to content
Open
16 changes: 16 additions & 0 deletions eng/skill-validator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,28 @@ skill-validator --verdict-warn-only --require-evals ./skills/
| `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors and `--require-evals` still fail. |
| `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) |
| `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions |
| `--run-in-docker` | `false` | Run agent sessions and setup commands inside a Docker container (requires `GITHUB_TOKEN`). |
| `--verbose` | `false` | Show tool calls and agent events during runs |
| `--reporter <spec>` | `console`, `json`, `markdown` | Output format: `console`, `json`, `junit`, `markdown`. |
| `--results-dir <path>` | `.skill-validator-results` | Directory for file reporter output. |

Models are validated on startup — invalid model names fail fast with a list of available models.

## Docker mode

Use `--run-in-docker` to run agent sessions and setup commands in a Docker container instead of the host environment.

Requirements:
- Docker installed and running
- `GITHUB_TOKEN` set in your environment (for example: `export GITHUB_TOKEN="$(gh auth token)"`)

```bash
# Run validation in Docker
dotnet run --project src/SkillValidator -- --run-in-docker ./skills/
```

When Docker mode is enabled, skill-validator builds the runtime image from `src/Docker/Dockerfile`, mounts run workdirs at `/work`, and mounts discovered skill directories read-only under `/skills/*`. The container is stopped and removed during normal cleanup and on process exit.

## Output

Results are displayed in the console with color-coded scores and metric deltas. By default, `json` and `markdown` reporters are enabled and write to `.skill-validator-results/` (override with `--results-dir`). File reporters write to that directory:
Expand Down
47 changes: 28 additions & 19 deletions eng/skill-validator/src/Commands/ValidateCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ public static RootCommand Create()
var reporterOpt = new Option<string[]>("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true };
var noOverfittingCheckOpt = new Option<bool>("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" };
var overfittingFixOpt = new Option<bool>("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" };
var runInDockerOpt = new Option<bool>("--run-in-docker") { Description = "Run agents and judges inside Docker containers" };

var command = new RootCommand("Validate that agent skills meaningfully improve agent performance")
{
Expand All @@ -53,6 +54,7 @@ public static RootCommand Create()
reporterOpt,
noOverfittingCheckOpt,
overfittingFixOpt,
runInDockerOpt,
};

command.SetAction(async (parseResult, _) =>
Expand Down Expand Up @@ -98,6 +100,7 @@ public static RootCommand Create()
TestsDir = parseResult.GetValue(testsDirOpt),
OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt),
OverfittingFix = parseResult.GetValue(overfittingFixOpt),
RunInDocker = parseResult.GetValue(runInDockerOpt),
};

return await Run(config);
Expand All @@ -117,6 +120,26 @@ public static RootCommand Create()

public static async Task<int> Run(ValidatorConfig config)
{
// Discover skills first (needed to determine Docker volume mounts)
var allSkills = new List<SkillInfo>();
foreach (var path in config.SkillPaths)
{
var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir);
allSkills.AddRange(skills);
}

if (allSkills.Count == 0)
{
Console.Error.WriteLine("No skills found in the specified paths.");
return 1;
}

Console.WriteLine($"Found {allSkills.Count} skill(s)\n");

// Set up DockerCopilotServer with skill directories to mount
if (config.RunInDocker)
DockerCopilotServer.Initialize(config.Verbose, allSkills);

// Validate model early
try
{
Expand Down Expand Up @@ -148,22 +171,6 @@ public static async Task<int> Run(ValidatorConfig config)
if (config.Verbose)
Console.WriteLine($"Results dir: {config.ResultsDir}");

// Discover skills
var allSkills = new List<SkillInfo>();
foreach (var path in config.SkillPaths)
{
var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir);
allSkills.AddRange(skills);
}

if (allSkills.Count == 0)
{
Console.Error.WriteLine("No skills found in the specified paths.");
return 1;
}

Console.WriteLine($"Found {allSkills.Count} skill(s)\n");

if (config.Runs < 5)
Console.WriteLine($"\x1b[33m⚠ Running with {config.Runs} run(s). For statistically significant results, use --runs 5 or higher.\x1b[0m");

Expand Down Expand Up @@ -202,6 +209,8 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
config.Model, config.JudgeModel, config.ResultsDir);

await AgentRunner.StopSharedClient();
if (DockerCopilotServer.Instance is { } dockerServer)
await dockerServer.StopAsync();
await AgentRunner.CleanupWorkDirs();

// Always fail on execution errors, even in --verdict-warn-only mode
Expand Down Expand Up @@ -262,13 +271,13 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
log(warning);

// Launch overfitting check in parallel with scenario execution
var workDir = Path.GetTempPath();
var overfittingWorkDir = DockerCopilotServer.Instance is not null ? "/tmp" : Path.GetTempPath();
Task<OverfittingResult?> overfittingTask = Task.FromResult<OverfittingResult?>(null);
if (config.OverfittingCheck && skill.EvalConfig is not null)
{
log("🔍 Running overfitting check (parallel)...");
overfittingTask = Services.OverfittingJudge.Analyze(skill, new OverfittingJudgeOptions(
config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir));
config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir));
}

bool singleScenario = skill.EvalConfig!.Scenarios.Count == 1;
Expand Down Expand Up @@ -301,7 +310,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
try
{
await Services.OverfittingJudge.GenerateFix(skill, overfittingResult, new OverfittingJudgeOptions(
config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir));
config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir));
log("📝 Generated eval.fixed.yaml with suggested improvements");
}
catch (Exception ex)
Expand Down
11 changes: 11 additions & 0 deletions eng/skill-validator/src/Docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Image that provides the `copilot` CLI binary on PATH, built on the .NET 10 SDK image.
# The binary is extracted from the GitHub.Copilot.SDK NuGet package rather than
# installed separately.
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build

# Version of the GitHub.Copilot.SDK package to pull the CLI from.
# No default is declared here, so the value presumably has to be supplied with
# `--build-arg COPILOT_SDK_VERSION=...` — confirm against the build invocation.
ARG COPILOT_SDK_VERSION

# Single layer that:
#   1. scaffolds a throwaway console project in /tmp/dl,
#   2. adds the GitHub.Copilot.SDK package at the requested version,
#   3. builds it in Release so the package's native assets land in the output folder,
#   4. copies the native `copilot` executable to /usr/local/bin and marks it executable,
#   5. deletes the throwaway project.
# NOTE(review): the `runtimes/*/native` glob assumes the build output contains exactly
# one runtime-identifier folder with a `copilot` binary — TODO confirm.
RUN dotnet new console -o /tmp/dl \
&& dotnet add /tmp/dl package GitHub.Copilot.SDK --version $COPILOT_SDK_VERSION \
&& dotnet build /tmp/dl -c Release \
&& cp /tmp/dl/bin/Release/net10.0/runtimes/*/native/copilot /usr/local/bin/copilot \
&& chmod +x /usr/local/bin/copilot \
&& rm -rf /tmp/dl

# Smoke check: invoke the copied binary at build time so a missing or
# non-runnable executable fails the image build.
RUN copilot --version
1 change: 1 addition & 0 deletions eng/skill-validator/src/Models/Models.cs
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ public sealed record ValidatorConfig
public string? TestsDir { get; init; }
public bool OverfittingCheck { get; init; } = true;
public bool OverfittingFix { get; init; }
public bool RunInDocker { get; init; } = false;
}

public static class DefaultWeights
Expand Down
53 changes: 38 additions & 15 deletions eng/skill-validator/src/Services/AgentRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,23 @@ public static async Task<CopilotClient> GetSharedClient(bool verbose)
{
if (_sharedClient is not null) return _sharedClient;

var options = new CopilotClientOptions
var options = new CopilotClientOptions();
if (DockerCopilotServer.Instance is {} dockerServer)
{
LogLevel = verbose ? "info" : "none",
};

var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
if (!string.IsNullOrEmpty(githubToken))
_workDirs.Add(dockerServer.GetHostDir());
options.CliUrl = await dockerServer.GetCliUrlAsync();
}
else
{
options.GitHubToken = githubToken;
// Clear the token from the environment so child processes
// (e.g. LLM-generated code, eval shell commands) cannot read it.
Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
options.LogLevel = verbose ? "info" : "none";
var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
if (!string.IsNullOrEmpty(githubToken))
{
options.GitHubToken = githubToken;
// Clear the token from the environment so child processes
// (e.g. LLM-generated code, eval shell commands) cannot read it.
Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
}
}

_sharedClient = new CopilotClient(options);
Expand Down Expand Up @@ -93,6 +98,9 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st

if (string.IsNullOrEmpty(reqPath)) return true;

if (DockerCopilotServer.Instance is {} dockerServer && dockerServer.TryMapContainerPathToHost(reqPath, out var mappedPath))
reqPath = mappedPath;

var resolved = Path.GetFullPath(reqPath);
var allowedDirs = new List<string> { Path.GetFullPath(workDir) };
if (skillPath is not null) allowedDirs.Add(Path.GetFullPath(skillPath));
Expand All @@ -106,10 +114,11 @@ internal static SessionConfig BuildSessionConfig(
SkillInfo? skill, string model, string workDir,
IReadOnlyDictionary<string, MCPServerDef>? mcpServers = null)
{
var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath();
var skillPath = skill is not null ? Path.GetDirectoryName(skill.Path) : null;

// Create a unique temporary config directory for this session to not share any data
var configDir = Path.Combine(Path.GetTempPath(), $"sv-cfg-{Guid.NewGuid():N}");
var configDir = Path.Combine(parentDir, $"sv-cfg-{Guid.NewGuid():N}");
Directory.CreateDirectory(configDir);
_workDirs.Add(configDir);

Expand All @@ -133,13 +142,19 @@ internal static SessionConfig BuildSessionConfig(
}
}

var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(workDir) ?? workDir;
var sessionSkillPath = skillPath is not null
? (DockerCopilotServer.Instance?.MapHostPathToContainer(skillPath) ?? skillPath)
: null;
var sessionConfigDir = DockerCopilotServer.Instance?.MapHostPathToContainer(configDir) ?? configDir;

return new SessionConfig
{
Model = model,
Streaming = true,
WorkingDirectory = workDir,
SkillDirectories = skill is not null ? [skillPath!] : [],
ConfigDir = configDir,
WorkingDirectory = sessionWorkDir,
SkillDirectories = skill is not null ? [sessionSkillPath!] : [],
ConfigDir = sessionConfigDir,
McpServers = sdkMcp,
InfiniteSessions = new InfiniteSessionConfig { Enabled = false },
OnPermissionRequest = (request, _) =>
Expand Down Expand Up @@ -289,7 +304,8 @@ public static async Task<RunMetrics> RunAgent(RunOptions options)

private static async Task<string> SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath)
{
var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath();
var workDir = Path.Combine(parentDir, $"sv-{Guid.NewGuid():N}");
Directory.CreateDirectory(workDir);
_workDirs.Add(workDir);

Expand Down Expand Up @@ -335,6 +351,13 @@ private static async Task<string> SetupWorkDir(EvalScenario scenario, string? sk
{
try
{
if (DockerCopilotServer.Instance is {} dockerServer)
{
var containerDir = dockerServer.MapHostPathToContainer(workDir);
await dockerServer.ExecAsync(containerDir, cmd);
continue;
}

var psi = new ProcessStartInfo
{
FileName = OperatingSystem.IsWindows() ? "cmd.exe" : "/bin/sh",
Expand Down
Loading