Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions eng/skill-validator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,28 @@ skill-validator --verdict-warn-only --require-evals ./skills/
| `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors and `--require-evals` still fail. |
| `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) |
| `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions |
| `--run-in-docker` | `false` | Run agent sessions and setup commands inside a Docker container (requires `GITHUB_TOKEN`). |
| `--verbose` | `false` | Show tool calls and agent events during runs |
| `--reporter <spec>` | `console`, `json`, `markdown` | Output format: `console`, `json`, `junit`, `markdown`. |
| `--results-dir <path>` | `.skill-validator-results` | Directory for file reporter output. |

Models are validated on startup — invalid model names fail fast with a list of available models.

## Docker mode

Use `--run-in-docker` to run agent sessions and setup commands in a Docker container instead of the host environment.

Requirements:
- Docker installed and running
- `GITHUB_TOKEN` set in your environment (for example: `export GITHUB_TOKEN="$(gh auth token)"`)

```bash
# Run validation in Docker
dotnet run --project src/SkillValidator -- --run-in-docker ./skills/
```

When Docker mode is enabled, skill-validator builds the runtime image from `src/Docker/Dockerfile`, mounts run working directories at `/work`, mounts discovered skill directories read-only under `/skills/*`, and stops and removes the container during normal cleanup (and on process exit).

## Output

Results are displayed in the console with color-coded scores and metric deltas. By default, the `json` and `markdown` reporters are also enabled. File reporters write their output to `.skill-validator-results/` (override with `--results-dir`):
Expand Down
47 changes: 28 additions & 19 deletions eng/skill-validator/src/Commands/ValidateCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ public static RootCommand Create()
var reporterOpt = new Option<string[]>("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true };
var noOverfittingCheckOpt = new Option<bool>("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" };
var overfittingFixOpt = new Option<bool>("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" };
var runInDockerOpt = new Option<bool>("--run-in-docker") { Description = "Run agents and judges inside Docker containers" };

var command = new RootCommand("Validate that agent skills meaningfully improve agent performance")
{
Expand All @@ -53,6 +54,7 @@ public static RootCommand Create()
reporterOpt,
noOverfittingCheckOpt,
overfittingFixOpt,
runInDockerOpt,
};

command.SetAction(async (parseResult, _) =>
Expand Down Expand Up @@ -98,6 +100,7 @@ public static RootCommand Create()
TestsDir = parseResult.GetValue(testsDirOpt),
OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt),
OverfittingFix = parseResult.GetValue(overfittingFixOpt),
RunInDocker = parseResult.GetValue(runInDockerOpt),
};

return await Run(config);
Expand All @@ -117,6 +120,26 @@ public static RootCommand Create()

public static async Task<int> Run(ValidatorConfig config)
{
// Discover skills first (needed to determine Docker volume mounts)
var allSkills = new List<SkillInfo>();
foreach (var path in config.SkillPaths)
{
var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir);
allSkills.AddRange(skills);
}

if (allSkills.Count == 0)
{
Console.Error.WriteLine("No skills found in the specified paths.");
return 1;
}

Console.WriteLine($"Found {allSkills.Count} skill(s)\n");

// Set up DockerCopilotServer with skill directories to mount
if (config.RunInDocker)
DockerCopilotServer.Initialize(config.Verbose, allSkills);

// Validate model early
try
{
Expand Down Expand Up @@ -148,22 +171,6 @@ public static async Task<int> Run(ValidatorConfig config)
if (config.Verbose)
Console.WriteLine($"Results dir: {config.ResultsDir}");

// Discover skills
var allSkills = new List<SkillInfo>();
foreach (var path in config.SkillPaths)
{
var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir);
allSkills.AddRange(skills);
}

if (allSkills.Count == 0)
{
Console.Error.WriteLine("No skills found in the specified paths.");
return 1;
}

Console.WriteLine($"Found {allSkills.Count} skill(s)\n");

if (config.Runs < 5)
Console.WriteLine($"\x1b[33m⚠ Running with {config.Runs} run(s). For statistically significant results, use --runs 5 or higher.\x1b[0m");

Expand Down Expand Up @@ -202,6 +209,8 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
config.Model, config.JudgeModel, config.ResultsDir);

await AgentRunner.StopSharedClient();
if (DockerCopilotServer.Instance is { } dockerServer)
await dockerServer.StopAsync();
await AgentRunner.CleanupWorkDirs();

// Always fail on execution errors, even in --verdict-warn-only mode
Expand Down Expand Up @@ -262,13 +271,13 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
log(warning);

// Launch overfitting check in parallel with scenario execution
var workDir = Path.GetTempPath();
var overfittingWorkDir = DockerCopilotServer.Instance is not null ? "/tmp" : Path.GetTempPath();
Task<OverfittingResult?> overfittingTask = Task.FromResult<OverfittingResult?>(null);
if (config.OverfittingCheck && skill.EvalConfig is not null)
{
log("🔍 Running overfitting check (parallel)...");
overfittingTask = Services.OverfittingJudge.Analyze(skill, new OverfittingJudgeOptions(
config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir));
config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir));
}

bool singleScenario = skill.EvalConfig!.Scenarios.Count == 1;
Expand Down Expand Up @@ -301,7 +310,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
try
{
await Services.OverfittingJudge.GenerateFix(skill, overfittingResult, new OverfittingJudgeOptions(
config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir));
config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir));
log("📝 Generated eval.fixed.yaml with suggested improvements");
}
catch (Exception ex)
Expand Down
11 changes: 11 additions & 0 deletions eng/skill-validator/src/Docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Image that provides the `copilot` CLI binary on top of the .NET 10 SDK image.
# The stage is named `build`; no later stage is defined in this file.
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build

# Version of the GitHub.Copilot.SDK NuGet package to install.
# No default is declared here, so the value is expected to be supplied by the
# build invocation (e.g. `--build-arg COPILOT_SDK_VERSION=...`).
ARG COPILOT_SDK_VERSION
# Obtain the native `copilot` executable by building a throwaway console project
# that references GitHub.Copilot.SDK: create the project in /tmp/dl, add the
# package at the requested version, build in Release, copy the `copilot` binary
# from the build output's runtimes/*/native directory to /usr/local/bin, mark it
# executable, and delete the temporary project. Everything runs in a single RUN
# so the temporary project does not persist in an image layer.
# NOTE(review): the `runtimes/*/native/copilot` glob presumably matches exactly
# one runtime directory; `cp` to a single file target fails otherwise — confirm.
RUN dotnet new console -o /tmp/dl \
&& dotnet add /tmp/dl package GitHub.Copilot.SDK --version $COPILOT_SDK_VERSION \
&& dotnet build /tmp/dl -c Release \
&& cp /tmp/dl/bin/Release/net10.0/runtimes/*/native/copilot /usr/local/bin/copilot \
&& chmod +x /usr/local/bin/copilot \
&& rm -rf /tmp/dl

# Smoke test: the image build fails here if the copied binary cannot be executed.
RUN copilot --version
1 change: 1 addition & 0 deletions eng/skill-validator/src/Models/Models.cs
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ public sealed record ValidatorConfig
public string? TestsDir { get; init; }
public bool OverfittingCheck { get; init; } = true;
public bool OverfittingFix { get; init; }
public bool RunInDocker { get; init; } = false;
}

public static class DefaultWeights
Expand Down
53 changes: 38 additions & 15 deletions eng/skill-validator/src/Services/AgentRunner.cs
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,23 @@ public static async Task<CopilotClient> GetSharedClient(bool verbose)
{
if (_sharedClient is not null) return _sharedClient;

var options = new CopilotClientOptions
var options = new CopilotClientOptions();
if (DockerCopilotServer.Instance is {} dockerServer)
{
LogLevel = verbose ? "info" : "none",
};

var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
if (!string.IsNullOrEmpty(githubToken))
_workDirs.Add(dockerServer.GetHostDir());
options.CliUrl = await dockerServer.GetCliUrlAsync();
}
else
{
options.GitHubToken = githubToken;
// Clear the token from the environment so child processes
// (e.g. LLM-generated code, eval shell commands) cannot read it.
Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
options.LogLevel = verbose ? "info" : "none";
var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
if (!string.IsNullOrEmpty(githubToken))
{
options.GitHubToken = githubToken;
// Clear the token from the environment so child processes
// (e.g. LLM-generated code, eval shell commands) cannot read it.
Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
}
}

_sharedClient = new CopilotClient(options);
Expand Down Expand Up @@ -93,6 +98,9 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st

if (string.IsNullOrEmpty(reqPath)) return true;

if (DockerCopilotServer.Instance is {} dockerServer && dockerServer.TryMapContainerPathToHost(reqPath, out var mappedPath))
reqPath = mappedPath;

var resolved = Path.GetFullPath(reqPath);
var allowedDirs = new List<string> { Path.GetFullPath(workDir) };
if (skillPath is not null) allowedDirs.Add(Path.GetFullPath(skillPath));
Expand All @@ -106,10 +114,11 @@ internal static SessionConfig BuildSessionConfig(
SkillInfo? skill, string model, string workDir,
IReadOnlyDictionary<string, MCPServerDef>? mcpServers = null)
{
var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath();
var skillPath = skill is not null ? Path.GetDirectoryName(skill.Path) : null;

// Create a unique temporary config directory for this session to not share any data
var configDir = Path.Combine(Path.GetTempPath(), $"sv-cfg-{Guid.NewGuid():N}");
var configDir = Path.Combine(parentDir, $"sv-cfg-{Guid.NewGuid():N}");
Directory.CreateDirectory(configDir);
_workDirs.Add(configDir);

Expand All @@ -133,13 +142,19 @@ internal static SessionConfig BuildSessionConfig(
}
}

var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(workDir) ?? workDir;
var sessionSkillPath = skillPath is not null
? (DockerCopilotServer.Instance?.MapHostPathToContainer(skillPath) ?? skillPath)
: null;
var sessionConfigDir = DockerCopilotServer.Instance?.MapHostPathToContainer(configDir) ?? configDir;

return new SessionConfig
{
Model = model,
Streaming = true,
WorkingDirectory = workDir,
SkillDirectories = skill is not null ? [skillPath!] : [],
ConfigDir = configDir,
WorkingDirectory = sessionWorkDir,
SkillDirectories = skill is not null ? [sessionSkillPath!] : [],
ConfigDir = sessionConfigDir,
McpServers = sdkMcp,
InfiniteSessions = new InfiniteSessionConfig { Enabled = false },
OnPermissionRequest = (request, _) =>
Expand Down Expand Up @@ -289,7 +304,8 @@ public static async Task<RunMetrics> RunAgent(RunOptions options)

private static async Task<string> SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath)
{
var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath();
var workDir = Path.Combine(parentDir, $"sv-{Guid.NewGuid():N}");
Directory.CreateDirectory(workDir);
_workDirs.Add(workDir);

Expand Down Expand Up @@ -335,6 +351,13 @@ private static async Task<string> SetupWorkDir(EvalScenario scenario, string? sk
{
try
{
if (DockerCopilotServer.Instance is {} dockerServer)
{
var containerDir = dockerServer.MapHostPathToContainer(workDir);
await dockerServer.ExecAsync(containerDir, cmd);
continue;
}

var psi = new ProcessStartInfo
{
FileName = OperatingSystem.IsWindows() ? "cmd.exe" : "/bin/sh",
Expand Down
Loading