diff --git a/eng/skill-validator/README.md b/eng/skill-validator/README.md index 1d48def3..192b0218 100644 --- a/eng/skill-validator/README.md +++ b/eng/skill-validator/README.md @@ -96,12 +96,28 @@ skill-validator --verdict-warn-only --require-evals ./skills/ | `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors and `--require-evals` still fail. | | `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) | | `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions | +| `--run-in-docker` | `false` | Run agent sessions and setup commands inside a Docker container (requires `GITHUB_TOKEN`). | | `--verbose` | `false` | Show tool calls and agent events during runs | | `--reporter ` | `console`, `json`, `markdown` | Output format: `console`, `json`, `junit`, `markdown`. | | `--results-dir ` | `.skill-validator-results` | Directory for file reporter output. | Models are validated on startup — invalid model names fail fast with a list of available models. +## Docker mode + +Use `--run-in-docker` to run agent sessions and setup commands in a Docker container instead of the host environment. + +Requirements: +- Docker installed and running +- `GITHUB_TOKEN` set in your environment (for example: `export GITHUB_TOKEN="$(gh auth token)"`) + +```bash +# Run validation in Docker +dotnet run --project src/SkillValidator -- --run-in-docker ./skills/ +``` + +When Docker mode is enabled, skill-validator builds the runtime image from `src/Docker/Dockerfile`, mounts run workdirs at `/work`, mounts discovered skill directories read-only under `/skills/*`, and stops/removes the container during normal cleanup (and on process exit). + ## Output Results are displayed in the console with color-coded scores and metric deltas. By default, `json` and `markdown` reporters are enabled and write to `.skill-validator-results/` (override with `--results-dir`). 
File reporters write to that directory: diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs index 66825448..8521a132 100644 --- a/eng/skill-validator/src/Commands/ValidateCommand.cs +++ b/eng/skill-validator/src/Commands/ValidateCommand.cs @@ -30,6 +30,7 @@ public static RootCommand Create() var reporterOpt = new Option("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true }; var noOverfittingCheckOpt = new Option("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" }; var overfittingFixOpt = new Option("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" }; + var runInDockerOpt = new Option("--run-in-docker") { Description = "Run agents and judges inside Docker containers" }; var command = new RootCommand("Validate that agent skills meaningfully improve agent performance") { @@ -53,6 +54,7 @@ public static RootCommand Create() reporterOpt, noOverfittingCheckOpt, overfittingFixOpt, + runInDockerOpt, }; command.SetAction(async (parseResult, _) => @@ -98,6 +100,7 @@ public static RootCommand Create() TestsDir = parseResult.GetValue(testsDirOpt), OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt), OverfittingFix = parseResult.GetValue(overfittingFixOpt), + RunInDocker = parseResult.GetValue(runInDockerOpt), }; return await Run(config); @@ -117,6 +120,26 @@ public static RootCommand Create() public static async Task Run(ValidatorConfig config) { + // Discover skills first (needed to determine Docker volume mounts) + var allSkills = new List(); + foreach (var path in config.SkillPaths) + { + var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir); + allSkills.AddRange(skills); + } + + if (allSkills.Count == 0) + { + Console.Error.WriteLine("No skills found in the specified paths."); + return 1; + } + + 
Console.WriteLine($"Found {allSkills.Count} skill(s)\n"); + + // Set up DockerCopilotServer with skill directories to mount + if (config.RunInDocker) + DockerCopilotServer.Initialize(config.Verbose, allSkills); + // Validate model early try { @@ -148,22 +171,6 @@ public static async Task Run(ValidatorConfig config) if (config.Verbose) Console.WriteLine($"Results dir: {config.ResultsDir}"); - // Discover skills - var allSkills = new List(); - foreach (var path in config.SkillPaths) - { - var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir); - allSkills.AddRange(skills); - } - - if (allSkills.Count == 0) - { - Console.Error.WriteLine("No skills found in the specified paths."); - return 1; - } - - Console.WriteLine($"Found {allSkills.Count} skill(s)\n"); - if (config.Runs < 5) Console.WriteLine($"\x1b[33m⚠ Running with {config.Runs} run(s). For statistically significant results, use --runs 5 or higher.\x1b[0m"); @@ -202,6 +209,8 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, config.Model, config.JudgeModel, config.ResultsDir); await AgentRunner.StopSharedClient(); + if (DockerCopilotServer.Instance is { } dockerServer) + await dockerServer.StopAsync(); await AgentRunner.CleanupWorkDirs(); // Always fail on execution errors, even in --verdict-warn-only mode @@ -262,13 +271,13 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, log(warning); // Launch overfitting check in parallel with scenario execution - var workDir = Path.GetTempPath(); + var overfittingWorkDir = DockerCopilotServer.Instance is not null ? 
"/tmp" : Path.GetTempPath(); Task overfittingTask = Task.FromResult(null); if (config.OverfittingCheck && skill.EvalConfig is not null) { log("🔍 Running overfitting check (parallel)..."); overfittingTask = Services.OverfittingJudge.Analyze(skill, new OverfittingJudgeOptions( - config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir)); + config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir)); } bool singleScenario = skill.EvalConfig!.Scenarios.Count == 1; @@ -301,7 +310,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, try { await Services.OverfittingJudge.GenerateFix(skill, overfittingResult, new OverfittingJudgeOptions( - config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir)); + config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir)); log("📝 Generated eval.fixed.yaml with suggested improvements"); } catch (Exception ex) diff --git a/eng/skill-validator/src/Docker/Dockerfile b/eng/skill-validator/src/Docker/Dockerfile new file mode 100644 index 00000000..1dfb7ed0 --- /dev/null +++ b/eng/skill-validator/src/Docker/Dockerfile @@ -0,0 +1,11 @@ +FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build + +ARG COPILOT_SDK_VERSION +RUN dotnet new console -o /tmp/dl \ + && dotnet add /tmp/dl package GitHub.Copilot.SDK --version $COPILOT_SDK_VERSION \ + && dotnet build /tmp/dl -c Release \ + && cp /tmp/dl/bin/Release/net10.0/runtimes/*/native/copilot /usr/local/bin/copilot \ + && chmod +x /usr/local/bin/copilot \ + && rm -rf /tmp/dl + +RUN copilot --version \ No newline at end of file diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs index 995d7463..f9b5f617 100644 --- a/eng/skill-validator/src/Models/Models.cs +++ b/eng/skill-validator/src/Models/Models.cs @@ -307,6 +307,7 @@ public sealed record ValidatorConfig public string? 
TestsDir { get; init; } public bool OverfittingCheck { get; init; } = true; public bool OverfittingFix { get; init; } + public bool RunInDocker { get; init; } = false; } public static class DefaultWeights diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs index 7a79a630..8bb120f1 100644 --- a/eng/skill-validator/src/Services/AgentRunner.cs +++ b/eng/skill-validator/src/Services/AgentRunner.cs @@ -35,18 +35,23 @@ public static async Task GetSharedClient(bool verbose) { if (_sharedClient is not null) return _sharedClient; - var options = new CopilotClientOptions + var options = new CopilotClientOptions(); + if (DockerCopilotServer.Instance is {} dockerServer) { - LogLevel = verbose ? "info" : "none", - }; - - var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN"); - if (!string.IsNullOrEmpty(githubToken)) + _workDirs.Add(dockerServer.GetHostDir()); + options.CliUrl = await dockerServer.GetCliUrlAsync(); + } + else { - options.GitHubToken = githubToken; - // Clear the token from the environment so child processes - // (e.g. LLM-generated code, eval shell commands) cannot read it. - Environment.SetEnvironmentVariable("GITHUB_TOKEN", null); + options.LogLevel = verbose ? "info" : "none"; + var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN"); + if (!string.IsNullOrEmpty(githubToken)) + { + options.GitHubToken = githubToken; + // Clear the token from the environment so child processes + // (e.g. LLM-generated code, eval shell commands) cannot read it. 
+ Environment.SetEnvironmentVariable("GITHUB_TOKEN", null); + } } _sharedClient = new CopilotClient(options); @@ -93,6 +98,9 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st if (string.IsNullOrEmpty(reqPath)) return true; + if (DockerCopilotServer.Instance is {} dockerServer && dockerServer.TryMapContainerPathToHost(reqPath, out var mappedPath)) + reqPath = mappedPath; + var resolved = Path.GetFullPath(reqPath); var allowedDirs = new List { Path.GetFullPath(workDir) }; if (skillPath is not null) allowedDirs.Add(Path.GetFullPath(skillPath)); @@ -106,10 +114,11 @@ internal static SessionConfig BuildSessionConfig( SkillInfo? skill, string model, string workDir, IReadOnlyDictionary? mcpServers = null) { + var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath(); var skillPath = skill is not null ? Path.GetDirectoryName(skill.Path) : null; // Create a unique temporary config directory for this session to not share any data - var configDir = Path.Combine(Path.GetTempPath(), $"sv-cfg-{Guid.NewGuid():N}"); + var configDir = Path.Combine(parentDir, $"sv-cfg-{Guid.NewGuid():N}"); Directory.CreateDirectory(configDir); _workDirs.Add(configDir); @@ -133,13 +142,19 @@ internal static SessionConfig BuildSessionConfig( } } + var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(workDir) ?? workDir; + var sessionSkillPath = skillPath is not null + ? (DockerCopilotServer.Instance?.MapHostPathToContainer(skillPath) ?? skillPath) + : null; + var sessionConfigDir = DockerCopilotServer.Instance?.MapHostPathToContainer(configDir) ?? configDir; + return new SessionConfig { Model = model, Streaming = true, - WorkingDirectory = workDir, - SkillDirectories = skill is not null ? [skillPath!] : [], - ConfigDir = configDir, + WorkingDirectory = sessionWorkDir, + SkillDirectories = skill is not null ? [sessionSkillPath!] 
: [], + ConfigDir = sessionConfigDir, McpServers = sdkMcp, InfiniteSessions = new InfiniteSessionConfig { Enabled = false }, OnPermissionRequest = (request, _) => @@ -289,7 +304,8 @@ public static async Task RunAgent(RunOptions options) private static async Task SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath) { - var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}"); + var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath(); + var workDir = Path.Combine(parentDir, $"sv-{Guid.NewGuid():N}"); Directory.CreateDirectory(workDir); _workDirs.Add(workDir); @@ -335,6 +351,13 @@ private static async Task SetupWorkDir(EvalScenario scenario, string? sk { try { + if (DockerCopilotServer.Instance is {} dockerServer) + { + var containerDir = dockerServer.MapHostPathToContainer(workDir); + await dockerServer.ExecAsync(containerDir, cmd); + continue; + } + var psi = new ProcessStartInfo { FileName = OperatingSystem.IsWindows() ? "cmd.exe" : "/bin/sh", diff --git a/eng/skill-validator/src/Services/DockerCopilotServer.cs b/eng/skill-validator/src/Services/DockerCopilotServer.cs new file mode 100644 index 00000000..14a51e57 --- /dev/null +++ b/eng/skill-validator/src/Services/DockerCopilotServer.cs @@ -0,0 +1,388 @@ +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Reflection; +using System.Text.RegularExpressions; +using GitHub.Copilot.SDK; +using SkillValidator.Models; + +namespace SkillValidator.Services; + +public partial class DockerCopilotServer +{ + record ContainerState(int HostPort); + + public static DockerCopilotServer? 
Instance { get; private set; }
+
+    /// <summary>Creates the process-wide <see cref="Instance"/>. The container itself is started lazily on first use.</summary>
+    public static void Initialize(bool verbose, IReadOnlyList<SkillInfo> skills)
+    {
+        Instance = Create(verbose, skills);
+    }
+
+    /// <summary>Creates a server without assigning <see cref="Instance"/>.</summary>
+    internal static DockerCopilotServer Create(bool verbose, IReadOnlyList<SkillInfo> skills)
+    {
+        return new DockerCopilotServer(verbose, BuildSkillMounts(skills));
+    }
+
+    private const int InternalPort = 4321;
+    private const string ImageBaseName = "skill-validator-base";
+
+    private readonly string _invocationId = Guid.NewGuid().ToString("N")[..8];
+    private readonly bool _verbose;
+    private readonly Lazy<Task<ContainerState>> _lazyStartTask;
+
+    /// <summary>Host skill directory → container mount point (e.g. "/skills/dotnet").</summary>
+    private readonly Dictionary<string, string> _skillMounts;
+
+    private ContainerState? _containerState;
+    private EventHandler? _processExitHandler;
+    private ConsoleCancelEventHandler? _cancelKeyPressHandler;
+
+    /// <summary>
+    /// Set once <c>docker run</c> has been issued, so that <see cref="StopAsync"/> also cleans up
+    /// containers whose startup failed or timed out (in which case <see cref="_containerState"/> stays null).
+    /// </summary>
+    private volatile bool _containerCreated;
+
+    private DockerCopilotServer(bool verbose, Dictionary<string, string> skillMounts)
+    {
+        _verbose = verbose;
+        _skillMounts = skillMounts;
+        _lazyStartTask = new Lazy<Task<ContainerState>>(() => StartAsync());
+    }
+
+    /// <summary>
+    /// Computes the read-only volume mounts for the given skills: one mount per distinct parent
+    /// directory of <c>skill.Path</c>, keyed by host path, with a unique <c>/skills/&lt;name&gt;</c> target.
+    /// </summary>
+    /// <param name="skills">Discovered skills; may be empty.</param>
+    /// <returns>Host directory → container mount point.</returns>
+    internal static Dictionary<string, string> BuildSkillMounts(IReadOnlyList<SkillInfo> skills)
+    {
+        // Host paths are only case-insensitive on Windows; on case-sensitive file systems two
+        // directories that differ only by case are distinct and must both be mounted.
+        var hostPathComparer = OperatingSystem.IsWindows() ? StringComparer.OrdinalIgnoreCase : StringComparer.Ordinal;
+        var mounts = new Dictionary<string, string>(hostPathComparer);
+        var usedNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
+
+        // Mount the grandparent directory of each SKILL.md (i.e. the parent of skill.Path)
+        foreach (var skill in skills)
+        {
+            var fullPath = Path.GetFullPath(Path.GetDirectoryName(skill.Path)!);
+            if (mounts.ContainsKey(fullPath))
+                continue;
+
+            var baseName = Path.GetFileName(fullPath);
+            if (string.IsNullOrEmpty(baseName))
+                baseName = "root"; // parent is a file-system root, which has no file name
+
+            // Probe "name", "name-1", "name-2", ... until the name is free. Checking the final name
+            // (not just the base name) prevents a generated suffix from colliding with a directory
+            // that is literally called "name-1".
+            var name = baseName;
+            for (var suffix = 1; !usedNames.Add(name); suffix++)
+                name = $"{baseName}-{suffix}";
+
+            mounts[fullPath] = $"/skills/{name}";
+        }
+        return mounts;
+    }
+
+    /// <summary>Host directory that is mounted at <c>/work</c> inside the container.</summary>
+    public string GetHostDir() => Path.Combine(Path.GetTempPath(), $"sv-container-{_invocationId}");
+
+    private string GetContainerName() => $"skill-validator-{_invocationId}";
+
+    /// <summary>Starts the container if necessary and returns the <c>host:port</c> of the Copilot server.</summary>
+    public async Task<string> GetCliUrlAsync(CancellationToken ct = default)
+    {
+        var state = await GetOrStartContainerAsync(ct).ConfigureAwait(false);
+        return $"localhost:{state.HostPort}";
+    }
+
+    private void RegisterProcessExitHandler()
+    {
+        if (_processExitHandler is not null || _cancelKeyPressHandler is not null)
+            return;
+
+        _processExitHandler = (_, _) => StopFromEventHandler("process exit");
+        _cancelKeyPressHandler = (_, _) => StopFromEventHandler("Ctrl+C");
+
+        AppDomain.CurrentDomain.ProcessExit += _processExitHandler;
+        Console.CancelKeyPress += _cancelKeyPressHandler;
+    }
+
+    /// <summary>Stops the container from a synchronous exit/cancel event handler without ever throwing.</summary>
+    private void StopFromEventHandler(string trigger)
+    {
+        try
+        {
+            // These event handlers are synchronous, so blocking on the async stop is unavoidable here.
+            StopAsync().GetAwaiter().GetResult();
+        }
+        catch (Exception ex)
+        {
+            if (_verbose)
+                Console.Error.WriteLine($"🐳 Failed to stop container on {trigger}: {ex.Message}");
+        }
+    }
+
+    private void UnregisterProcessExitHandler()
+    {
+        if (_processExitHandler is null && _cancelKeyPressHandler is null)
+            return;
+
+        if (_processExitHandler is not null)
+            AppDomain.CurrentDomain.ProcessExit -= _processExitHandler;
+        if (_cancelKeyPressHandler is not null)
+            Console.CancelKeyPress -= _cancelKeyPressHandler;
+        _processExitHandler = null;
+        _cancelKeyPressHandler = null;
+    }
+
+    /// <summary>
+    /// Stops and removes the container (best effort) and unregisters the exit handlers.
+    /// Does nothing when no container was ever started.
+    /// </summary>
+    public async Task StopAsync(CancellationToken ct = default)
+    {
+        var containerName = GetContainerName();
+        try
+        {
+            // Also clean up when `docker run` was issued but startup never completed.
+            if (_containerState is null && !_containerCreated)
+                return;
+
+            try
+            {
+                await RunDockerCommandAsync(["stop", containerName], ct);
+            }
+            catch { /* container may already be stopped */ }
+
+            try
+            {
+                await RunDockerCommandAsync(["rm", containerName], ct);
+            }
+            catch { /* container may already be removed */ }
+
+            _containerState = null;
+            _containerCreated = false;
+
+            if (_verbose)
+                Console.Error.WriteLine($"🐳 Container {containerName} stopped and removed.");
+        }
+        finally
+        {
+            UnregisterProcessExitHandler();
+        }
+    }
+
+    /// <summary>Translates a host path under the work dir or a mounted skill dir into its container path.</summary>
+    /// <exception cref="ArgumentException">The path is not inside any mounted directory.</exception>
+    public string MapHostPathToContainer(string hostPath)
+    {
+        var fullPath = Path.GetFullPath(hostPath);
+
+        // Check work dir mount
+        if (TryMapToContainerMount(fullPath, GetHostDir(), "/work", out var workResult))
+            return workResult;
+
+        // Check skill dir mounts
+        foreach (var (hostSkillDir, containerMount) in _skillMounts)
+        {
+            if (TryMapToContainerMount(fullPath, hostSkillDir, containerMount, out var skillResult))
+                return skillResult;
+        }
+
+        throw new ArgumentException($"Host path is not mapped into the container: {hostPath}");
+    }
+
+    /// <summary>Translates a container path under <c>/work</c> or a skill mount back to the host path.</summary>
+    /// <returns><c>false</c> when the path is not below any mount point.</returns>
+    public bool TryMapContainerPathToHost(string containerPath, [NotNullWhen(true)] out string? hostPath)
+    {
+        // Container paths are Linux paths: compare ordinally, never culture-sensitively.
+        if (containerPath.StartsWith("/work/", StringComparison.Ordinal) || containerPath == "/work")
+        {
+            var relativePath = containerPath == "/work" ? "." : containerPath["/work/".Length..];
+            hostPath = Path.GetFullPath(Path.Combine(GetHostDir(), relativePath));
+            return true;
+        }
+
+        foreach (var (hostSkillDir, containerMount) in _skillMounts)
+        {
+            var prefix = containerMount + "/";
+            if (containerPath.StartsWith(prefix, StringComparison.Ordinal) || containerPath == containerMount)
+            {
+                var relativePath = containerPath == containerMount ? "." : containerPath[prefix.Length..];
+                hostPath = Path.GetFullPath(Path.Combine(hostSkillDir, relativePath));
+                return true;
+            }
+        }
+
+        hostPath = null;
+        return false;
+    }
+
+    /// <summary>Runs <paramref name="command"/> via <c>/bin/sh -c</c> inside the container, starting it if needed.</summary>
+    /// <param name="workDir">Working directory as a container path.</param>
+    public async Task ExecAsync(string workDir, string command, CancellationToken ct = default)
+    {
+        _ = await GetOrStartContainerAsync(ct);
+
+        await RunDockerCommandAsync(["exec", "--workdir", workDir, GetContainerName(), "/bin/sh", "-c", command], ct);
+    }
+
+    private async Task<ContainerState> GetOrStartContainerAsync(CancellationToken ct = default)
+    {
+        if (_containerState is not null)
+            return _containerState;
+
+        _containerState = await _lazyStartTask.Value.WaitAsync(ct).ConfigureAwait(false);
+        return _containerState;
+    }
+
+    /// <summary>Builds the image, runs the container, waits until the server is listening and resolves the host port.</summary>
+    /// <exception cref="InvalidOperationException">GITHUB_TOKEN is missing, a docker command failed, or the container exited early.</exception>
+    /// <exception cref="TimeoutException">The server did not report readiness within 30 seconds.</exception>
+    private async Task<ContainerState> StartAsync(CancellationToken ct = default)
+    {
+        if (string.IsNullOrEmpty(Environment.GetEnvironmentVariable("GITHUB_TOKEN")))
+            throw new InvalidOperationException("GITHUB_TOKEN environment variable is required when running in Docker. You can get it with 'gh auth token'.");
+
+        if (_verbose)
+            Console.Error.WriteLine("🐳 Building Docker image ...");
+
+        var sdkVersion = GetCopilotSdkVersion();
+        var imageName = $"{ImageBaseName}:{sdkVersion}";
+        var dockerFilePath = Path.Combine(AppContext.BaseDirectory, "Docker", "Dockerfile");
+
+        await RunDockerCommandAsync(
+            ["build", "-t", imageName, "--build-arg", $"COPILOT_SDK_VERSION={sdkVersion}", "-f", dockerFilePath, Path.GetDirectoryName(dockerFilePath)!], ct);
+
+        if (_verbose)
+            Console.Error.WriteLine("🐳 Docker image built successfully.");
+
+        var containerName = GetContainerName();
+
+        if (_verbose)
+            Console.Error.WriteLine($"🐳 Starting container {containerName}...");
+
+        var hostDir = GetHostDir();
+        Directory.CreateDirectory(hostDir);
+
+        var runArgs = new List<string>
+        {
+            "run",
+            "--name", containerName,
+            "-p", $"0:{InternalPort}", // Map internal port to a random host port
+            "-e", "GITHUB_TOKEN",
+            "-v", $"{hostDir}:/work", // Mount host dir to /work in container
+        };
+
+        // Mount skill directories as read-only volumes
+        foreach (var (hostSkillDir, containerMount) in _skillMounts)
+            runArgs.AddRange(["-v", $"{hostSkillDir}:{containerMount}:ro"]);
+
+        runArgs.AddRange([
+            imageName,
+            // Start the Copilot server in headless mode, listening on the internal port, and using the GITHUB_TOKEN from env
+            "copilot",
+            "--headless",
+            "--port", InternalPort.ToString(),
+            "--auth-token-env", "GITHUB_TOKEN",
+            "--no-auto-login",
+            "--no-auto-update",
+            "--log-level", (_verbose ? "info" : "none")
+        ]);
+
+        // Arm cleanup BEFORE `docker run`: if startup fails or the process dies half-way,
+        // the container must still be stopped and removed.
+        RegisterProcessExitHandler();
+        _containerCreated = true;
+
+        try
+        {
+            await WaitUntilListeningAsync(runArgs, containerName, ct);
+
+            var output = await RunDockerCommandAsync(["port", containerName, InternalPort.ToString()], ct);
+            var portMatch = PortPattern().Match(output);
+            if (!portMatch.Success)
+                throw new InvalidOperationException($"Could not parse port mapping from: {output}");
+
+            var port = int.Parse(portMatch.Groups[1].Value);
+
+            if (_verbose)
+                Console.Error.WriteLine($"🐳 Container {containerName} ready (port {port})");
+
+            return new ContainerState(port);
+        }
+        catch
+        {
+            // Do not leave a half-started container behind.
+            await StopAsync(CancellationToken.None);
+            throw;
+        }
+    }
+
+    /// <summary>
+    /// Launches <c>docker run</c> attached and reads its stdout until the server reports that it is
+    /// listening. The attached docker CLI process is then killed; the container keeps running.
+    /// </summary>
+    private static async Task WaitUntilListeningAsync(IReadOnlyList<string> runArgs, string containerName, CancellationToken ct)
+    {
+        using var process = StartNonDetached(runArgs);
+
+        using var cts = CancellationTokenSource.CreateLinkedTokenSource(ct);
+        cts.CancelAfter(TimeSpan.FromSeconds(30));
+
+        try
+        {
+            while (true)
+            {
+                var line = await process.StandardOutput.ReadLineAsync(cts.Token);
+                if (line is null)
+                {
+                    var stderr = await process.StandardError.ReadToEndAsync(ct);
+                    throw new InvalidOperationException(
+                        $"Container {containerName} exited before becoming ready. stderr: {stderr}");
+                }
+
+                if (ListeningPattern().IsMatch(line))
+                    return;
+            }
+        }
+        catch (OperationCanceledException) when (!ct.IsCancellationRequested)
+        {
+            // The linked 30s timer fired (not the caller's token): report it as a timeout.
+            throw new TimeoutException($"Container {containerName} did not become ready within 30s.");
+        }
+        finally
+        {
+            if (!process.HasExited)
+            {
+                process.Kill();
+                await process.WaitForExitAsync();
+            }
+        }
+    }
+
+    /// <summary>Runs <c>docker &lt;args&gt;</c> to completion and returns its trimmed stdout.</summary>
+    /// <exception cref="InvalidOperationException">docker exited with a non-zero exit code.</exception>
+    private static async Task<string> RunDockerCommandAsync(IReadOnlyList<string> args, CancellationToken ct = default)
+    {
+        using var proc = StartNonDetached(args);
+
+        var stdoutTask = proc.StandardOutput.ReadToEndAsync(ct);
+        var stderrTask = proc.StandardError.ReadToEndAsync(ct);
+        try
+        {
+            await proc.WaitForExitAsync(ct);
+        }
+        catch (OperationCanceledException)
+        {
+            // Do not orphan the docker CLI process when the caller gives up.
+            try
+            {
+                if (!proc.HasExited)
+                    proc.Kill(entireProcessTree: true);
+            }
+            catch (InvalidOperationException) { /* exited in the meantime */ }
+            throw;
+        }
+        var stdout = await stdoutTask;
+        var stderr = await stderrTask;
+
+        if (proc.ExitCode != 0)
+        {
+            var output = string.IsNullOrWhiteSpace(stderr) ? stdout : stderr;
+            throw new InvalidOperationException(
+                $"docker {args[0]} failed (exit {proc.ExitCode}): {output.Trim()}");
+        }
+
+        return stdout.Trim();
+    }
+
+    /// <summary>Starts <c>docker</c> with redirected stdout/stderr; the caller owns and disposes the process.</summary>
+    private static Process StartNonDetached(IReadOnlyList<string> args)
+    {
+        var psi = new ProcessStartInfo("docker", args)
+        {
+            RedirectStandardOutput = true,
+            RedirectStandardError = true,
+            UseShellExecute = false,
+            CreateNoWindow = true,
+        };
+
+        return Process.Start(psi)
+            ?? throw new InvalidOperationException("Failed to start docker run process");
+    }
+
+    /// <summary>
+    /// Maps <paramref name="fullPath"/> to a path below <paramref name="containerMount"/> when it lies
+    /// inside <paramref name="hostDir"/> (or is that directory itself).
+    /// </summary>
+    private static bool TryMapToContainerMount(
+        string fullPath,
+        string hostDir,
+        string containerMount,
+        [NotNullWhen(true)] out string? containerPath)
+    {
+        var relativePath = Path.GetRelativePath(hostDir, fullPath);
+        if (!Path.IsPathRooted(relativePath) &&
+            !relativePath.StartsWith(".." + Path.DirectorySeparatorChar, StringComparison.Ordinal) &&
+            relativePath != "..")
+        {
+            // Convert only the HOST separator: on Linux a backslash is a legal file-name character
+            // and must be preserved. The mount root itself maps to the mount point, not "<mount>/.".
+            containerPath = relativePath == "."
+                ? containerMount
+                : $"{containerMount}/{relativePath.Replace(Path.DirectorySeparatorChar, '/')}";
+            return true;
+        }
+
+        containerPath = null;
+        return false;
+    }
+
+    /// <summary>Returns the GitHub.Copilot.SDK informational version without the commit-hash suffix.</summary>
+    internal static string GetCopilotSdkVersion()
+    {
+        var attr = typeof(CopilotClient).Assembly
+            .GetCustomAttribute<AssemblyInformationalVersionAttribute>();
+        var version = attr?.InformationalVersion
+            ?? throw new InvalidOperationException("Could not determine GitHub.Copilot.SDK version from assembly.");
+        // Strip the commit hash suffix (e.g. "0.1.26+abc123" → "0.1.26")
+        var plusIndex = version.IndexOf('+');
+        return plusIndex >= 0 ?
version[..plusIndex] : version; + } + + [GeneratedRegex(@"listening on port (\d+)", RegexOptions.IgnoreCase)] + private static partial Regex ListeningPattern(); + + [GeneratedRegex(@":(\d+)$", RegexOptions.Multiline)] + private static partial Regex PortPattern(); +} diff --git a/eng/skill-validator/src/Services/Judge.cs b/eng/skill-validator/src/Services/Judge.cs index 46a630b0..00e3e28e 100644 --- a/eng/skill-validator/src/Services/Judge.cs +++ b/eng/skill-validator/src/Services/Judge.cs @@ -34,11 +34,13 @@ private static async Task JudgeRunOnce( { var client = await AgentRunner.GetSharedClient(options.Verbose); + var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(options.WorkDir) ?? options.WorkDir; + await using var session = await client.CreateSessionAsync(new SessionConfig { Model = options.Model, Streaming = true, - WorkingDirectory = options.WorkDir, + WorkingDirectory = sessionWorkDir, SystemMessage = new SystemMessageConfig { Mode = SystemMessageMode.Replace, diff --git a/eng/skill-validator/src/Services/PairwiseJudge.cs b/eng/skill-validator/src/Services/PairwiseJudge.cs index a7dcab25..c1060fc9 100644 --- a/eng/skill-validator/src/Services/PairwiseJudge.cs +++ b/eng/skill-validator/src/Services/PairwiseJudge.cs @@ -69,11 +69,13 @@ private static async Task JudgeCall( var client = await AgentRunner.GetSharedClient(options.Verbose); var rubric = scenario.Rubric ?? []; + var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(options.WorkDir) ?? 
options.WorkDir; + await using var session = await client.CreateSessionAsync(new SessionConfig { Model = options.Model, Streaming = true, - WorkingDirectory = options.WorkDir, + WorkingDirectory = sessionWorkDir, SystemMessage = new SystemMessageConfig { Mode = SystemMessageMode.Replace, diff --git a/eng/skill-validator/src/SkillValidator.csproj b/eng/skill-validator/src/SkillValidator.csproj index 0a5bd5dc..54ee5906 100644 --- a/eng/skill-validator/src/SkillValidator.csproj +++ b/eng/skill-validator/src/SkillValidator.csproj @@ -38,4 +38,8 @@ + + + + diff --git a/eng/skill-validator/tests/DockerCopilotServerTests.cs b/eng/skill-validator/tests/DockerCopilotServerTests.cs new file mode 100644 index 00000000..381eb962 --- /dev/null +++ b/eng/skill-validator/tests/DockerCopilotServerTests.cs @@ -0,0 +1,245 @@ +using SkillValidator.Models; +using SkillValidator.Services; + +namespace SkillValidator.Tests; + +public class BuildSkillMountsTests +{ + private static SkillInfo MakeSkill(string path) => new( + Name: Path.GetFileName(path), + Description: "test", + Path: path, + SkillMdPath: Path.Combine(path, "SKILL.md"), + SkillMdContent: "# Test", + EvalPath: null, + EvalConfig: null); + + [Fact] + public void MountsGrandparentOfSkillMd() + { + // skill.Path = /home/user/plugins/dotnet/dotnet-msbuild + // grandparent of SKILL.md = parent of skill.Path = /home/user/plugins/dotnet + var skills = new[] { MakeSkill("/home/user/plugins/dotnet/dotnet-msbuild") }; + var mounts = DockerCopilotServer.BuildSkillMounts(skills); + + var expected = Path.GetFullPath("/home/user/plugins/dotnet"); + Assert.Single(mounts); + Assert.True(mounts.ContainsKey(expected)); + Assert.Equal("/skills/dotnet", mounts[expected]); + } + + [Fact] + public void DeduplicatesSkillsInSameParentDirectory() + { + var skills = new[] + { + MakeSkill("/home/user/plugins/dotnet/skill-a"), + MakeSkill("/home/user/plugins/dotnet/skill-b"), + }; + var mounts = DockerCopilotServer.BuildSkillMounts(skills); + + 
Assert.Single(mounts); + } + + [Fact] + public void HandlesNameCollisionsWithIncrementingSuffix() + { + // Two different parent dirs both named "plugins" + var skills = new[] + { + MakeSkill("/home/user/area1/plugins/skill-a"), + MakeSkill("/home/user/area2/plugins/skill-b"), + }; + var mounts = DockerCopilotServer.BuildSkillMounts(skills); + + Assert.Equal(2, mounts.Count); + var containerPaths = mounts.Values.OrderBy(v => v).ToList(); + Assert.Equal("/skills/plugins", containerPaths[0]); + Assert.Equal("/skills/plugins-1", containerPaths[1]); + } + + [Fact] + public void MultipleDistinctParentsGetSeparateMounts() + { + var skills = new[] + { + MakeSkill("/home/user/plugins/dotnet/skill-a"), + MakeSkill("/home/user/plugins/python/skill-b"), + }; + var mounts = DockerCopilotServer.BuildSkillMounts(skills); + + Assert.Equal(2, mounts.Count); + Assert.Contains(mounts.Values, v => v == "/skills/dotnet"); + Assert.Contains(mounts.Values, v => v == "/skills/python"); + } + + [Fact] + public void EmptySkillListProducesEmptyMounts() + { + var mounts = DockerCopilotServer.BuildSkillMounts([]); + Assert.Empty(mounts); + } +} + +public class MapHostPathToContainerTests +{ + [Fact] + public void MapsWorkDirPath() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + var hostDir = server.GetHostDir(); + Directory.CreateDirectory(hostDir); + + try + { + var subPath = Path.Combine(hostDir, "sv-abc123"); + var result = server.MapHostPathToContainer(subPath); + Assert.Equal("/work/sv-abc123", result); + } + finally + { + Directory.Delete(hostDir, true); + } + } + + [Fact] + public void MapsWorkDirNestedPath() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + var hostDir = server.GetHostDir(); + Directory.CreateDirectory(hostDir); + + try + { + var subPath = Path.Combine(hostDir, "run1", "subdir", "file.txt"); + var result = server.MapHostPathToContainer(subPath); + Assert.Equal("/work/run1/subdir/file.txt", result); + } + finally + 
{
+            Directory.Delete(hostDir, true);
+        }
+    }
+
+    [Fact]
+    public void MapsSkillDirPath()
+    {
+        // Create a real temp directory to use as a skill path
+        var tempParent = Path.Combine(Path.GetTempPath(), $"test-skills-{Guid.NewGuid():N}");
+        var skillDir = Path.Combine(tempParent, "dotnet", "dotnet-msbuild");
+        Directory.CreateDirectory(skillDir);
+
+        try
+        {
+            var skill = new SkillInfo(
+                Name: "dotnet-msbuild",
+                Description: "test",
+                Path: skillDir,
+                SkillMdPath: Path.Combine(skillDir, "SKILL.md"),
+                SkillMdContent: "# Test",
+                EvalPath: null,
+                EvalConfig: null);
+
+            var server = DockerCopilotServer.Create(verbose: false, skills: [skill]);
+            var fullParent = Path.GetFullPath(tempParent);
+
+            // Map a path inside the skill's parent directory. The mount is the parent of
+            // skill.Path ("dotnet"), so the whole container path is known exactly.
+            var result = server.MapHostPathToContainer(Path.Combine(fullParent, "dotnet", "dotnet-msbuild", "SKILL.md"));
+            Assert.Equal("/skills/dotnet/dotnet-msbuild/SKILL.md", result);
+        }
+        finally
+        {
+            Directory.Delete(tempParent, true);
+        }
+    }
+
+    [Fact]
+    public void ThrowsForUnmappedPath()
+    {
+        var server = DockerCopilotServer.Create(verbose: false, skills: []);
+        Assert.Throws<ArgumentException>(() =>
+            server.MapHostPathToContainer("/some/random/path"));
+    }
+}
+
+public class TryMapContainerPathToHostTests
+{
+    [Fact]
+    public void MapsWorkPathToHost()
+    {
+        var server = DockerCopilotServer.Create(verbose: false, skills: []);
+        var hostDir = server.GetHostDir();
+
+        Assert.True(server.TryMapContainerPathToHost("/work/run1/file.txt", out var hostPath));
+        Assert.Equal(Path.Combine(hostDir, "run1", "file.txt"), hostPath);
+    }
+
+    [Fact]
+    public void MapsWorkRootToHost()
+    {
+        var server = DockerCopilotServer.Create(verbose: false, skills: []);
+        Assert.True(server.TryMapContainerPathToHost("/work", out var hostPath));
+        // "/work" itself must resolve to the mounted host directory, not merely to some non-null path.
+        Assert.Equal(Path.GetFullPath(server.GetHostDir()), hostPath);
+    }
+
+    [Fact]
+    public void MapsSkillPathToHost()
+    {
+        var tempParent = Path.Combine(Path.GetTempPath(), $"test-skills-{Guid.NewGuid():N}");
+        var skillDir = 
Path.Combine(tempParent, "dotnet", "my-skill"); + Directory.CreateDirectory(skillDir); + + try + { + var skill = new SkillInfo( + Name: "my-skill", + Description: "test", + Path: skillDir, + SkillMdPath: Path.Combine(skillDir, "SKILL.md"), + SkillMdContent: "# Test", + EvalPath: null, + EvalConfig: null); + + var server = DockerCopilotServer.Create(verbose: false, skills: [skill]); + + // The mount is the parent of skill.Path (/skills/dotnet → tempParent/dotnet) + var mounts = DockerCopilotServer.BuildSkillMounts([skill]); + var containerMount = mounts.Values.First(); + + Assert.True(server.TryMapContainerPathToHost($"{containerMount}/my-skill/SKILL.md", out var hostPath)); + Assert.EndsWith(Path.Combine("my-skill", "SKILL.md"), hostPath); + } + finally + { + Directory.Delete(tempParent, true); + } + } + + [Fact] + public void ReturnsFalseForUnmappedPath() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + Assert.False(server.TryMapContainerPathToHost("/tmp/something", out _)); + } + + [Fact] + public void ReturnsFalseForPartialWorkPrefix() + { + var server = DockerCopilotServer.Create(verbose: false, skills: []); + Assert.False(server.TryMapContainerPathToHost("/workspace/file.txt", out _)); + } +} + +public class GetCopilotSdkVersionTests +{ + [Fact] + public void ReturnsSemverWithoutCommitHash() + { + var version = DockerCopilotServer.GetCopilotSdkVersion(); + + // Should be something like "0.1.26", not "0.1.26+abc123" + Assert.DoesNotContain("+", version); + Assert.Matches(@"^\d+\.\d+\.\d+", version); + } +}