dotnet · caaavik-msft · Mar 3, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
@@ -96,12 +96,28 @@ skill-validator --verdict-warn-only --require-evals ./skills/
 | `--verdict-warn-only` | `false` | Treat verdict failures as warnings (exit 0). Execution errors and `--require-evals` violations still fail the run. |
 | `--no-overfitting-check` | `false` | Disable the LLM-based overfitting analysis (on by default) |
 | `--overfitting-fix` | `false` | Generate `eval.fixed.yaml` with improved rubric items/assertions |
+| `--run-in-docker` | `false` | Run agent sessions and setup commands inside a Docker container (requires `GITHUB_TOKEN`). |
 | `--verbose` | `false` | Show tool calls and agent events during runs |
 | `--reporter <spec>` | `console`, `json`, `markdown` | Output format: `console`, `json`, `junit`, `markdown`. |
 | `--results-dir <path>` | `.skill-validator-results` | Directory for file reporter output. |
 
 Models are validated on startup — invalid model names fail fast with a list of available models.
 
+## Docker mode
+
+Use `--run-in-docker` to run agent sessions and setup commands in a Docker container instead of the host environment.
+
+Requirements:
+- Docker installed and running
+- `GITHUB_TOKEN` set in your environment (for example: `export GITHUB_TOKEN="$(gh auth token)"`)
+
+```bash
+# Run validation in Docker
+dotnet run --project src/SkillValidator -- --run-in-docker ./skills/
+```
+
+When Docker mode is enabled, skill-validator builds the runtime image from `src/Docker/Dockerfile`, mounts run workdirs at `/work`, and mounts discovered skill directories read-only under `/skills/*`. The container is stopped and removed during normal cleanup and on process exit.
+
 ## Output
 
 Results are displayed in the console with color-coded scores and metric deltas. By default, `json` and `markdown` reporters are enabled and write to `.skill-validator-results/` (override with `--results-dir`). File reporters write to that directory:

@@ -30,6 +30,7 @@ public static RootCommand Create()
         var reporterOpt = new Option<string[]>("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true };
         var noOverfittingCheckOpt = new Option<bool>("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" };
         var overfittingFixOpt = new Option<bool>("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" };
+        var runInDockerOpt = new Option<bool>("--run-in-docker") { Description = "Run agents and judges inside Docker containers" };
 
         var command = new RootCommand("Validate that agent skills meaningfully improve agent performance")
         {
@@ -53,6 +54,7 @@ public static RootCommand Create()
             reporterOpt,
             noOverfittingCheckOpt,
             overfittingFixOpt,
+            runInDockerOpt,
         };
 
         command.SetAction(async (parseResult, _) =>
@@ -98,6 +100,7 @@ public static RootCommand Create()
                 TestsDir = parseResult.GetValue(testsDirOpt),
                 OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt),
                 OverfittingFix = parseResult.GetValue(overfittingFixOpt),
+                RunInDocker = parseResult.GetValue(runInDockerOpt),
             };
 
             return await Run(config);
@@ -117,6 +120,26 @@ public static RootCommand Create()
 
     public static async Task<int> Run(ValidatorConfig config)
     {
+        // Discover skills first (needed to determine Docker volume mounts)
+        var allSkills = new List<SkillInfo>();
+        foreach (var path in config.SkillPaths)
+        {
+            var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir);
+            allSkills.AddRange(skills);
+        }
+
+        if (allSkills.Count == 0)
+        {
+            Console.Error.WriteLine("No skills found in the specified paths.");
+            return 1;
+        }
+
+        Console.WriteLine($"Found {allSkills.Count} skill(s)\n");
+
+        // Set up DockerCopilotServer with skill directories to mount
+        if (config.RunInDocker)
+            DockerCopilotServer.Initialize(config.Verbose, allSkills);
+
         // Validate model early
         try
         {
@@ -148,22 +171,6 @@ public static async Task<int> Run(ValidatorConfig config)
         if (config.Verbose)
             Console.WriteLine($"Results dir: {config.ResultsDir}");
 
-        // Discover skills
-        var allSkills = new List<SkillInfo>();
-        foreach (var path in config.SkillPaths)
-        {
-            var skills = await SkillDiscovery.DiscoverSkills(path, config.TestsDir);
-            allSkills.AddRange(skills);
-        }
-
-        if (allSkills.Count == 0)
-        {
-            Console.Error.WriteLine("No skills found in the specified paths.");
-            return 1;
-        }
-
-        Console.WriteLine($"Found {allSkills.Count} skill(s)\n");
-
         if (config.Runs < 5)
             Console.WriteLine($"\x1b[33m⚠  Running with {config.Runs} run(s). For statistically significant results, use --runs 5 or higher.\x1b[0m");
 
@@ -202,6 +209,8 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
             config.Model, config.JudgeModel, config.ResultsDir);
 
         await AgentRunner.StopSharedClient();
+        if (DockerCopilotServer.Instance is { } dockerServer)
+            await dockerServer.StopAsync();
         await AgentRunner.CleanupWorkDirs();
 
         // Always fail on execution errors, even in --verdict-warn-only mode
@@ -262,13 +271,13 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
             log(warning);
 
         // Launch overfitting check in parallel with scenario execution
-        var workDir = Path.GetTempPath();
+        var overfittingWorkDir = DockerCopilotServer.Instance is not null ? "/tmp" : Path.GetTempPath();
         Task<OverfittingResult?> overfittingTask = Task.FromResult<OverfittingResult?>(null);
         if (config.OverfittingCheck && skill.EvalConfig is not null)
         {
             log("🔍 Running overfitting check (parallel)...");
             overfittingTask = Services.OverfittingJudge.Analyze(skill, new OverfittingJudgeOptions(
-                config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir));
+                config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir));
         }
 
         bool singleScenario = skill.EvalConfig!.Scenarios.Count == 1;
@@ -301,7 +310,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose,
             try
             {
                 await Services.OverfittingJudge.GenerateFix(skill, overfittingResult, new OverfittingJudgeOptions(
-                    config.JudgeModel, config.Verbose, config.JudgeTimeout, workDir));
+                    config.JudgeModel, config.Verbose, config.JudgeTimeout, overfittingWorkDir));
                 log("📝 Generated eval.fixed.yaml with suggested improvements");
             }
             catch (Exception ex)

@@ -0,0 +1,11 @@
+FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
+
+ARG COPILOT_SDK_VERSION
+RUN dotnet new console -o /tmp/dl \
+    && dotnet add /tmp/dl package GitHub.Copilot.SDK --version $COPILOT_SDK_VERSION \
+    && dotnet build /tmp/dl -c Release \
+    && cp /tmp/dl/bin/Release/net10.0/runtimes/*/native/copilot /usr/local/bin/copilot \
+    && chmod +x /usr/local/bin/copilot \
+    && rm -rf /tmp/dl
+
+RUN copilot --version
@@ -307,6 +307,7 @@ public sealed record ValidatorConfig
     public string? TestsDir { get; init; }
     public bool OverfittingCheck { get; init; } = true;
     public bool OverfittingFix { get; init; }
+    public bool RunInDocker { get; init; } = false;
 }
 
 public static class DefaultWeights

@@ -35,18 +35,23 @@ public static async Task<CopilotClient> GetSharedClient(bool verbose)
         {
             if (_sharedClient is not null) return _sharedClient;
 
-            var options = new CopilotClientOptions
+            var options = new CopilotClientOptions();
+            if (DockerCopilotServer.Instance is {} dockerServer)
             {
-                LogLevel = verbose ? "info" : "none",
-            };
-
-            var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
-            if (!string.IsNullOrEmpty(githubToken))
+                _workDirs.Add(dockerServer.GetHostDir());
+                options.CliUrl = await dockerServer.GetCliUrlAsync();
+            }
+            else
             {
-                options.GitHubToken = githubToken;
-                // Clear the token from the environment so child processes
-                // (e.g. LLM-generated code, eval shell commands) cannot read it.
-                Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
+                options.LogLevel = verbose ? "info" : "none";
+                var githubToken = Environment.GetEnvironmentVariable("GITHUB_TOKEN");
+                if (!string.IsNullOrEmpty(githubToken))
+                {
+                    options.GitHubToken = githubToken;
+                    // Clear the token from the environment so child processes
+                    // (e.g. LLM-generated code, eval shell commands) cannot read it.
+                    Environment.SetEnvironmentVariable("GITHUB_TOKEN", null);
+                }
             }
 
             _sharedClient = new CopilotClient(options);
@@ -93,6 +98,9 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st
 
         if (string.IsNullOrEmpty(reqPath)) return true;
 
+        if (DockerCopilotServer.Instance is {} dockerServer && dockerServer.TryMapContainerPathToHost(reqPath, out var mappedPath))
+            reqPath = mappedPath;
+
         var resolved = Path.GetFullPath(reqPath);
         var allowedDirs = new List<string> { Path.GetFullPath(workDir) };
         if (skillPath is not null) allowedDirs.Add(Path.GetFullPath(skillPath));
@@ -106,10 +114,11 @@ internal static SessionConfig BuildSessionConfig(
         SkillInfo? skill, string model, string workDir,
         IReadOnlyDictionary<string, MCPServerDef>? mcpServers = null)
     {
+        var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath();
         var skillPath = skill is not null ? Path.GetDirectoryName(skill.Path) : null;
 
         // Create a unique temporary config directory for this session to not share any data
-        var configDir = Path.Combine(Path.GetTempPath(), $"sv-cfg-{Guid.NewGuid():N}");
+        var configDir = Path.Combine(parentDir, $"sv-cfg-{Guid.NewGuid():N}");
         Directory.CreateDirectory(configDir);
         _workDirs.Add(configDir);
 
@@ -133,13 +142,19 @@ internal static SessionConfig BuildSessionConfig(
             }
         }
 
+        var sessionWorkDir = DockerCopilotServer.Instance?.MapHostPathToContainer(workDir) ?? workDir;
+        var sessionSkillPath = skillPath is not null
+            ? (DockerCopilotServer.Instance?.MapHostPathToContainer(skillPath) ?? skillPath)
+            : null;
+        var sessionConfigDir = DockerCopilotServer.Instance?.MapHostPathToContainer(configDir) ?? configDir;
+
         return new SessionConfig
         {
             Model = model,
             Streaming = true,
-            WorkingDirectory = workDir,
-            SkillDirectories = skill is not null ? [skillPath!] : [],
-            ConfigDir = configDir,
+            WorkingDirectory = sessionWorkDir,
+            SkillDirectories = skill is not null ? [sessionSkillPath!] : [],
+            ConfigDir = sessionConfigDir,
             McpServers = sdkMcp,
             InfiniteSessions = new InfiniteSessionConfig { Enabled = false },
             OnPermissionRequest = (request, _) =>
@@ -289,7 +304,8 @@ public static async Task<RunMetrics> RunAgent(RunOptions options)
 
     private static async Task<string> SetupWorkDir(EvalScenario scenario, string? skillPath, string? evalPath)
     {
-        var workDir = Path.Combine(Path.GetTempPath(), $"sv-{Guid.NewGuid():N}");
+        var parentDir = DockerCopilotServer.Instance?.GetHostDir() ?? Path.GetTempPath();
+        var workDir = Path.Combine(parentDir, $"sv-{Guid.NewGuid():N}");
         Directory.CreateDirectory(workDir);
         _workDirs.Add(workDir);
 
@@ -335,6 +351,13 @@ private static async Task<string> SetupWorkDir(EvalScenario scenario, string? sk
             {
                 try
                 {
+                    if (DockerCopilotServer.Instance is {} dockerServer)
+                    {
+                        var containerDir = dockerServer.MapHostPathToContainer(workDir);
+                        await dockerServer.ExecAsync(containerDir, cmd);
+                        continue;
+                    }
+
                     var psi = new ProcessStartInfo
                     {
                         FileName = OperatingSystem.IsWindows() ? "cmd.exe" : "/bin/sh",