diff --git a/eng/skill-validator/src/Commands/RejudgeCommand.cs b/eng/skill-validator/src/Commands/RejudgeCommand.cs new file mode 100644 index 00000000..241c3ce7 --- /dev/null +++ b/eng/skill-validator/src/Commands/RejudgeCommand.cs @@ -0,0 +1,226 @@ +using System.CommandLine; +using System.Text.Json; +using SkillValidator.Models; +using SkillValidator.Services; +using SkillValidator.Utilities; + +namespace SkillValidator.Commands; + +public static class RejudgeCommand +{ + public static Command Create() + { + var resultsDirArg = new Argument("results-dir") { Description = "Path to a timestamped results directory containing sessions.db" }; + var judgeModelOpt = new Option("--judge-model") { Description = "Model to use for judging (defaults to original model)" }; + var judgeModeOpt = new Option("--judge-mode") { Description = "Judge mode: pairwise, independent, or both", DefaultValueFactory = _ => "pairwise" }; + var judgeTimeoutOpt = new Option("--judge-timeout") { Description = "Judge timeout in seconds", DefaultValueFactory = _ => 300 }; + var verboseOpt = new Option("--verbose") { Description = "Show detailed output" }; + var minImprovementOpt = new Option("--min-improvement") { Description = "Minimum improvement score to pass (0-1)", DefaultValueFactory = _ => 0.1 }; + var requireCompletionOpt = new Option("--require-completion") { Description = "Fail if skill regresses task completion", DefaultValueFactory = _ => true }; + var confidenceLevelOpt = new Option("--confidence-level") { Description = "Confidence level for statistical intervals (0-1)", DefaultValueFactory = _ => 0.95 }; + + var command = new Command("rejudge", "Re-run judges on saved sessions without re-running agents") + { + resultsDirArg, + judgeModelOpt, + judgeModeOpt, + judgeTimeoutOpt, + verboseOpt, + minImprovementOpt, + requireCompletionOpt, + confidenceLevelOpt, + }; + + command.SetAction(async (parseResult, _) => + { + var resultsDir = parseResult.GetValue(resultsDirArg)!; + var 
judgeModel = parseResult.GetValue(judgeModelOpt); + var verbose = parseResult.GetValue(verboseOpt); + var judgeTimeout = parseResult.GetValue(judgeTimeoutOpt) * 1000; + var minImprovement = parseResult.GetValue(minImprovementOpt); + var requireCompletion = parseResult.GetValue(requireCompletionOpt); + var confidenceLevel = parseResult.GetValue(confidenceLevelOpt); + + var judgeMode = parseResult.GetValue(judgeModeOpt) switch + { + "independent" => JudgeMode.Independent, + "both" => JudgeMode.Both, + _ => JudgeMode.Pairwise, + }; + + return await Run(resultsDir, judgeModel, judgeMode, judgeTimeout, verbose, + minImprovement, requireCompletion, confidenceLevel); + }); + + return command; + } + + public static async Task Run( + string resultsDir, + string? judgeModel, + JudgeMode judgeMode, + int judgeTimeout, + bool verbose, + double minImprovement, + bool requireCompletion, + double confidenceLevel) + { + var dbPath = Path.Combine(resultsDir, "sessions.db"); + if (!File.Exists(dbPath)) + { + Console.Error.WriteLine($"No sessions.db found at {dbPath}"); + Console.Error.WriteLine("Use --keep-sessions during evaluation to enable rejudging."); + return 1; + } + + using var sessionDb = new SessionDatabase(dbPath); + var sessions = sessionDb.GetCompletedSessions(); + + if (sessions.Count == 0) + { + Console.Error.WriteLine("No completed sessions found in the database."); + return 1; + } + + // Determine judge model from sessions if not specified + var effectiveJudgeModel = judgeModel ?? 
sessions[0].Model; + + // Validate model + try + { + var client = await AgentRunner.GetSharedClient(verbose); + var models = await client.ListModelsAsync(); + if (!models.Any(m => m.Id == effectiveJudgeModel)) + { + Console.Error.WriteLine($"Invalid model: \"{effectiveJudgeModel}\"\nAvailable models: {string.Join(", ", models.Select(m => m.Id))}"); + return 1; + } + } + catch (Exception error) + { + Console.Error.WriteLine($"Failed to validate model: {error}"); + return 1; + } + + Console.WriteLine($"Rejudging {sessions.Count} sessions with model: {effectiveJudgeModel}, mode: {judgeMode}"); + + bool usePairwise = judgeMode is JudgeMode.Pairwise or JudgeMode.Both; + var jsonOpts = new JsonSerializerOptions { PropertyNameCaseInsensitive = true }; + + // Group sessions into run pairs: (skill, scenario, run_index) → (baseline, with-skill) + var runPairs = sessions + .GroupBy(s => (s.SkillName, s.ScenarioName, s.RunIndex)) + .Where(g => g.Any(s => s.Role == "baseline") && g.Any(s => s.Role == "with-skill")) + .ToList(); + + if (runPairs.Count == 0) + { + Console.Error.WriteLine("No complete run pairs (baseline + with-skill) found."); + return 1; + } + + Console.WriteLine($"Found {runPairs.Count} run pair(s) across {runPairs.Select(g => g.Key.SkillName).Distinct().Count()} skill(s)\n"); + + // Group by skill → scenario for verdict computation + var skillGroups = runPairs.GroupBy(g => g.Key.SkillName); + var verdicts = new List(); + + foreach (var skillGroup in skillGroups) + { + var skillName = skillGroup.Key; + var firstSession = skillGroup.First().First(); + Console.WriteLine($"[{skillName}] Rejudging..."); + + var scenarioGroups = skillGroup.GroupBy(g => g.Key.ScenarioName); + var comparisons = new List(); + + foreach (var scenarioGroup in scenarioGroups) + { + var scenarioName = scenarioGroup.Key; + var perRunScores = new List(); + RunResult? lastBaseline = null; + RunResult? lastWithSkill = null; + PairwiseJudgeResult? 
lastPairwise = null; + + foreach (var runGroup in scenarioGroup) + { + var baselineSess = runGroup.First(s => s.Role == "baseline"); + var skillSess = runGroup.First(s => s.Role == "with-skill"); + + var baselineMetrics = JsonSerializer.Deserialize(baselineSess.MetricsJson!, jsonOpts)!; + var withSkillMetrics = JsonSerializer.Deserialize(skillSess.MetricsJson!, jsonOpts)!; + + // Reconstruct scenario for judge (we need rubric) + // For now, create a minimal scenario from the saved data + var scenario = new EvalScenario(scenarioName, ""); + + // Re-judge + var judgeOpts = new JudgeOptions(effectiveJudgeModel, verbose, judgeTimeout, baselineMetrics.WorkDir, firstSession.SkillPath); + var judgeTasks = await Task.WhenAll( + Judge.JudgeRun(scenario, baselineMetrics, judgeOpts), + Judge.JudgeRun(scenario, withSkillMetrics, judgeOpts with { WorkDir = withSkillMetrics.WorkDir })); + + var baselineResult = new RunResult(baselineMetrics, judgeTasks[0]); + var withSkillResult = new RunResult(withSkillMetrics, judgeTasks[1]); + + // Update judge results in DB + sessionDb.SaveJudgeResult(baselineSess.Id, JsonSerializer.Serialize(judgeTasks[0])); + sessionDb.SaveJudgeResult(skillSess.Id, JsonSerializer.Serialize(judgeTasks[1])); + + // Pairwise + PairwiseJudgeResult? 
pairwise = null;
+                    if (usePairwise)
+                    {
+                        try
+                        {
+                            pairwise = await PairwiseJudge.Judge(
+                                scenario, baselineMetrics, withSkillMetrics,
+                                new PairwiseJudgeOptions(effectiveJudgeModel, verbose, judgeTimeout, baselineMetrics.WorkDir, firstSession.SkillPath));
+                            sessionDb.SavePairwiseResult(baselineSess.Id, JsonSerializer.Serialize(pairwise));
+                        }
+                        catch (Exception error)
+                        {
+                            Console.Error.WriteLine($" ⚠️ Pairwise judge failed: {error.Message}");
+                        }
+                    }
+
+                    var runComparison = Comparator.CompareScenario(scenarioName, baselineResult, withSkillResult, pairwise);
+                    perRunScores.Add(runComparison.ImprovementScore);
+
+                    lastBaseline = baselineResult;
+                    lastWithSkill = withSkillResult;
+                    lastPairwise = pairwise;
+                }
+
+                if (lastBaseline is not null && lastWithSkill is not null)
+                {
+                    var comparison = Comparator.CompareScenario(scenarioName, lastBaseline, lastWithSkill, lastPairwise);
+                    comparison.PerRunScores = perRunScores;
+                    comparisons.Add(comparison);
+                }
+            }
+
+            if (comparisons.Count > 0)
+            {
+                var skill = new SkillInfo(skillName, "", firstSession.SkillPath, firstSession.SkillPath, "", null, null);
+                var verdict = Comparator.ComputeVerdict(skill, comparisons, minImprovement, requireCompletion, confidenceLevel);
+                Console.WriteLine($"[{skillName}] {(verdict.Passed ? "✅" : "❌")} Score: {verdict.OverallImprovementScore * 100:F1}%");
+                verdicts.Add(verdict);
+            }
+        }
+
+        // Write new results
+        var reporters = new List
+        {
+            new(ReporterType.Console),
+            new(ReporterType.Json),
+            new(ReporterType.Markdown),
+        };
+        await Reporter.ReportResults(verdicts, reporters, verbose,
+            sessions[0].Model, effectiveJudgeModel, resultsDir); // agent model is the one recorded with the saved sessions; only the judge model may be overridden
+
+        await AgentRunner.StopSharedClient();
+
+        return verdicts.All(v => v.Passed) ?
0 : 1; + } +} diff --git a/eng/skill-validator/src/Commands/ValidateCommand.cs b/eng/skill-validator/src/Commands/ValidateCommand.cs index f20e1dc7..0cd5a7e4 100644 --- a/eng/skill-validator/src/Commands/ValidateCommand.cs +++ b/eng/skill-validator/src/Commands/ValidateCommand.cs @@ -1,4 +1,5 @@ using System.CommandLine; +using System.Text.Json; using SkillValidator.Models; using SkillValidator.Services; using SkillValidator.Utilities; @@ -30,6 +31,7 @@ public static RootCommand Create() var reporterOpt = new Option("--reporter") { Description = "Reporter (console, json, junit, markdown). Can be repeated.", AllowMultipleArgumentsPerToken = true }; var noOverfittingCheckOpt = new Option("--no-overfitting-check") { Description = "Disable LLM-based overfitting analysis (on by default)" }; var overfittingFixOpt = new Option("--overfitting-fix") { Description = "Generate a fixed eval.yaml with improved rubric items/assertions" }; + var keepSessionsOpt = new Option("--keep-sessions") { Description = "Preserve agent session data in the results directory for later rejudging" }; var command = new RootCommand("Validate that agent skills meaningfully improve agent performance") { @@ -53,6 +55,7 @@ public static RootCommand Create() reporterOpt, noOverfittingCheckOpt, overfittingFixOpt, + keepSessionsOpt, }; command.SetAction(async (parseResult, _) => @@ -98,6 +101,7 @@ public static RootCommand Create() TestsDir = parseResult.GetValue(testsDirOpt), OverfittingCheck = !parseResult.GetValue(noOverfittingCheckOpt), OverfittingFix = parseResult.GetValue(overfittingFixOpt), + KeepSessions = parseResult.GetValue(keepSessionsOpt), }; return await Run(config); @@ -169,13 +173,27 @@ public static async Task Run(ValidatorConfig config) bool usePairwise = config.JudgeMode is JudgeMode.Pairwise or JudgeMode.Both; + // Set up session persistence if requested + string? sessionsDir = null; + SessionDatabase? sessionDb = null; + string? 
timestampedResultsDir = null; + if (config.KeepSessions && config.ResultsDir is not null) + { + timestampedResultsDir = Path.Combine(config.ResultsDir, Reporter.FormatTimestamp(DateTime.Now)); + Directory.CreateDirectory(timestampedResultsDir); + sessionsDir = Path.Combine(timestampedResultsDir, "sessions"); + Directory.CreateDirectory(sessionsDir); + sessionDb = new SessionDatabase(Path.Combine(timestampedResultsDir, "sessions.db")); + Console.WriteLine($"Session persistence enabled: {timestampedResultsDir}"); + } + using var spinner = new Spinner(); using var skillLimit = new ConcurrencyLimiter(config.ParallelSkills); // Evaluate skills spinner.Start($"Evaluating {allSkills.Count} skill(s)..."); var skillTasks = allSkills.Select(skill => - skillLimit.RunAsync(() => EvaluateSkill(skill, config, usePairwise, spinner))); + skillLimit.RunAsync(() => EvaluateSkill(skill, config, usePairwise, spinner, sessionsDir, sessionDb))); var settled = await Task.WhenAll(skillTasks.Select(async t => { try { return (Result: await t, Error: (Exception?)null); } @@ -199,10 +217,11 @@ public static async Task Run(ValidatorConfig config) } await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, - config.Model, config.JudgeModel, config.ResultsDir); + config.Model, config.JudgeModel, config.ResultsDir, timestampedResultsDir); await AgentRunner.StopSharedClient(); - await AgentRunner.CleanupWorkDirs(); + await AgentRunner.CleanupWorkDirs(config.KeepSessions); + sessionDb?.Dispose(); // Always fail on execution errors, even in --verdict-warn-only mode if (hasRejections) return 1; @@ -224,7 +243,9 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, SkillInfo skill, ValidatorConfig config, bool usePairwise, - Spinner spinner) + Spinner spinner, + string? sessionsDir, + SessionDatabase? 
sessionDb) { var prefix = $"[{skill.Name}]"; var log = (string msg) => spinner.Log($"{prefix} {msg}"); @@ -275,7 +296,7 @@ await Reporter.ReportResults(verdicts, config.Reporters, config.Verbose, using var scenarioLimit = new ConcurrencyLimiter(config.ParallelScenarios); var scenarioTasks = skill.EvalConfig.Scenarios.Select(scenario => - scenarioLimit.RunAsync(() => ExecuteScenario(scenario, skill, config, usePairwise, singleScenario, spinner))); + scenarioLimit.RunAsync(() => ExecuteScenario(scenario, skill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb))); var comparisons = (await Task.WhenAll(scenarioTasks)).ToList(); // Await overfitting result (non-fatal — never blocks an otherwise-successful evaluation) @@ -349,7 +370,9 @@ private static async Task ExecuteScenario( ValidatorConfig config, bool usePairwise, bool singleScenario, - Spinner spinner) + Spinner spinner, + string? sessionsDir, + SessionDatabase? sessionDb) { var tag = singleScenario ? $"[{skill.Name}]" : $"[{skill.Name}/{scenario.Name}]"; var scenarioLog = (string msg) => spinner.Log($"{tag} {msg}"); @@ -359,7 +382,7 @@ private static async Task ExecuteScenario( scenarioLog("📋 Starting scenario"); var runTasks = Enumerable.Range(0, config.Runs).Select(i => - runLimit.RunAsync(() => ExecuteRun(i, scenario, skill, config, usePairwise, singleScenario, spinner))); + runLimit.RunAsync(() => ExecuteRun(i, scenario, skill, config, usePairwise, singleScenario, spinner, sessionsDir, sessionDb))); var runResults = await Task.WhenAll(runTasks); scenarioLog($"✓ All {config.Runs} run(s) complete"); @@ -413,7 +436,9 @@ private static async Task ExecuteRun( ValidatorConfig config, bool usePairwise, bool singleScenario, - Spinner spinner) + Spinner spinner, + string? sessionsDir, + SessionDatabase? sessionDb) { var runTag = config.Runs > 1 ? (singleScenario ? 
$"[{skill.Name}/{runIndex + 1}]" : $"[{skill.Name}/{scenario.Name}/{runIndex + 1}]")
@@ -423,12 +448,36 @@ private static async Task ExecuteRun(
         if (config.Verbose)
             runLog("running agents...");
 
+        // Generate session IDs for tracking
+        var baselineSessionId = Guid.NewGuid().ToString("N");
+        var skillSessionId = Guid.NewGuid().ToString("N");
+
+        // Register sessions before running — config_dir stored as relative path for portability
+        var skillDir = Path.GetDirectoryName(skill.Path);
+        var skillSha = sessionDb is not null && skillDir is not null ? SessionDatabase.ComputeDirectorySha(skillDir) : null; // hash only when persisting: it reads every file under the skill directory
+        var baselineConfigDir = sessionsDir is not null ? Path.Combine("sessions", baselineSessionId) : null;
+        var skillConfigDir = sessionsDir is not null ? Path.Combine("sessions", skillSessionId) : null;
+        sessionDb?.RegisterSession(baselineSessionId, skill.Name, skill.Path, scenario.Name, runIndex, "baseline", config.Model, baselineConfigDir, null, scenario.Prompt, skillSha);
+        sessionDb?.RegisterSession(skillSessionId, skill.Name, skill.Path, scenario.Name, runIndex, "with-skill", config.Model, skillConfigDir, null, scenario.Prompt, skillSha);
+
         var agentTasks = await Task.WhenAll(
-            AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog)),
-            AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog)));
+            AgentRunner.RunAgent(new RunOptions(scenario, null, skill.EvalPath, config.Model, config.Verbose, runLog,
+                SessionsDir: sessionsDir, SessionId: baselineSessionId, SessionRole: "baseline", SkillName: skill.Name, ScenarioName: scenario.Name, RunIndex: runIndex)),
+            AgentRunner.RunAgent(new RunOptions(scenario, skill, skill.EvalPath, config.Model, config.Verbose, runLog,
+                SessionsDir: sessionsDir, SessionId: skillSessionId, SessionRole: "with-skill", SkillName: skill.Name, ScenarioName: scenario.Name, RunIndex: runIndex)));
         var baselineMetrics = agentTasks[0];
         var withSkillMetrics = agentTasks[1];
 
+
// Save metrics to session DB + if (sessionDb is not null) + { + var jsonOpts = new JsonSerializerOptions { WriteIndented = false }; + var baselineStatus = baselineMetrics.TimedOut ? "timed_out" : "completed"; + var skillStatus = withSkillMetrics.TimedOut ? "timed_out" : "completed"; + sessionDb.CompleteSession(baselineSessionId, baselineStatus, JsonSerializer.Serialize(baselineMetrics, jsonOpts)); + sessionDb.CompleteSession(skillSessionId, skillStatus, JsonSerializer.Serialize(withSkillMetrics, jsonOpts)); + } + // Evaluate assertions if (scenario.Assertions is { Count: > 0 }) { @@ -489,6 +538,14 @@ private static async Task ExecuteRun( var baseline = new RunResult(baselineMetrics, baselineJudge); var withSkillResult = new RunResult(withSkillMetrics, withSkillJudge); + // Save judge results to session DB + if (sessionDb is not null) + { + var jsonOpts = new JsonSerializerOptions { WriteIndented = false }; + sessionDb.SaveJudgeResult(baselineSessionId, JsonSerializer.Serialize(baselineJudge, jsonOpts)); + sessionDb.SaveJudgeResult(skillSessionId, JsonSerializer.Serialize(withSkillJudge, jsonOpts)); + } + // Pairwise judging PairwiseJudgeResult? pairwise = null; if (usePairwise) @@ -505,6 +562,13 @@ private static async Task ExecuteRun( } } + // Save pairwise result to session DB + if (sessionDb is not null && pairwise is not null) + { + var jsonOpts = new JsonSerializerOptions { WriteIndented = false }; + sessionDb.SavePairwiseResult(baselineSessionId, JsonSerializer.Serialize(pairwise, jsonOpts)); + } + // Skill activation var skillActivation = MetricsCollector.ExtractSkillActivation(withSkillMetrics.Events, baselineMetrics.ToolCallBreakdown); diff --git a/eng/skill-validator/src/Models/Models.cs b/eng/skill-validator/src/Models/Models.cs index 71be6ea4..d31f9248 100644 --- a/eng/skill-validator/src/Models/Models.cs +++ b/eng/skill-validator/src/Models/Models.cs @@ -299,6 +299,7 @@ public sealed record ValidatorConfig public string? 
TestsDir { get; init; } public bool OverfittingCheck { get; init; } = true; public bool OverfittingFix { get; init; } + public bool KeepSessions { get; init; } } public static class DefaultWeights diff --git a/eng/skill-validator/src/Program.cs b/eng/skill-validator/src/Program.cs index 042ea61c..90b7909c 100644 --- a/eng/skill-validator/src/Program.cs +++ b/eng/skill-validator/src/Program.cs @@ -3,6 +3,7 @@ var rootCommand = ValidateCommand.Create(); rootCommand.Add(ConsolidateCommand.Create()); +rootCommand.Add(RejudgeCommand.Create()); var parseResult = rootCommand.Parse(args); return await parseResult.InvokeAsync(); diff --git a/eng/skill-validator/src/Services/AgentRunner.cs b/eng/skill-validator/src/Services/AgentRunner.cs index 066e6050..a255a288 100644 --- a/eng/skill-validator/src/Services/AgentRunner.cs +++ b/eng/skill-validator/src/Services/AgentRunner.cs @@ -12,13 +12,20 @@ public sealed record RunOptions( string? EvalPath, string Model, bool Verbose, - Action? Log = null); + Action? Log = null, + string? SessionsDir = null, + string? SessionId = null, + string? SessionRole = null, + string? SkillName = null, + string? ScenarioName = null, + int RunIndex = 0); public static class AgentRunner { private static CopilotClient? _sharedClient; private static readonly SemaphoreSlim _clientLock = new(1, 1); private static readonly ConcurrentBag _workDirs = []; + private static readonly ConcurrentBag _configDirs = []; /// /// Returns the shared , creating it on first call. @@ -68,11 +75,16 @@ public static async Task StopSharedClient() } /// Remove all temporary working directories created during runs. - public static Task CleanupWorkDirs() + public static Task CleanupWorkDirs(bool keepSessions = false) { var dirs = _workDirs.ToArray(); _workDirs.Clear(); - return Task.WhenAll(dirs.Select(dir => + + var configDirsToClean = keepSessions ? 
[] : _configDirs.ToArray();
+        _configDirs.Clear();
+
+        var allDirs = dirs.Concat(configDirsToClean);
+        return Task.WhenAll(allDirs.Select(dir =>
         {
             try { Directory.Delete(dir, true); } catch { }
             return Task.CompletedTask;
@@ -103,14 +115,27 @@ public static bool CheckPermission(PermissionRequest request, string workDir, st
     internal static SessionConfig BuildSessionConfig(
         SkillInfo? skill, string model, string workDir,
-        IReadOnlyDictionary? mcpServers = null)
+        IReadOnlyDictionary? mcpServers = null,
+        string? sessionsDir = null, string? sessionId = null)
     {
         var skillPath = skill is not null ? Path.GetDirectoryName(skill.Path) : null;
 
-        // Create a unique temporary config directory for this session to not share any data
-        var configDir = Path.Combine(Path.GetTempPath(), $"skill-validator-cfg-{Guid.NewGuid():N}");
-        Directory.CreateDirectory(configDir);
-        _workDirs.Add(configDir);
+        string configDir;
+        if (sessionsDir is not null)
+        {
+            // Persistent session dir — use sessionId as folder name for DB linkage
+            var dirName = sessionId ?? Guid.NewGuid().ToString("N");
+            configDir = Path.Combine(sessionsDir, dirName);
+            Directory.CreateDirectory(configDir);
+            _configDirs.Add(configDir);
+        }
+        else
+        {
+            // Ephemeral temp dir — tracked in _workDirs so CleanupWorkDirs deletes it even when keepSessions is true
+            configDir = Path.Combine(Path.GetTempPath(), $"skill-validator-cfg-{Guid.NewGuid():N}");
+            Directory.CreateDirectory(configDir);
+            _workDirs.Add(configDir);
+        }
 
         // Convert MCPServerDef records to the SDK's Dictionary shape
         Dictionary?
sdkMcp = null; @@ -171,7 +196,7 @@ public static async Task RunAgent(RunOptions options) var client = await GetSharedClient(options.Verbose); await using var session = await client.CreateSessionAsync( - BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers)); + BuildSessionConfig(options.Skill, options.Model, workDir, options.Skill?.McpServers, options.SessionsDir, options.SessionId)); var done = new TaskCompletionSource(); var effectiveTimeout = options.Scenario.Timeout; diff --git a/eng/skill-validator/src/Services/Reporter.cs b/eng/skill-validator/src/Services/Reporter.cs index 1977186b..0c3201d0 100644 --- a/eng/skill-validator/src/Services/Reporter.cs +++ b/eng/skill-validator/src/Services/Reporter.cs @@ -13,13 +13,15 @@ public static async Task ReportResults( bool verbose, string? model = null, string? judgeModel = null, - string? resultsDir = null) + string? resultsDir = null, + string? timestampedResultsDir = null) { bool needsResultsDir = reporters.Any(r => r.Type is ReporterType.Json or ReporterType.Junit or ReporterType.Markdown); - string? effectiveResultsDir = resultsDir is not null && needsResultsDir - ? Path.Combine(resultsDir, FormatTimestamp(DateTime.Now)) - : null; + string? effectiveResultsDir = timestampedResultsDir + ?? (resultsDir is not null && needsResultsDir + ? Path.Combine(resultsDir, FormatTimestamp(DateTime.Now)) + : null); if (effectiveResultsDir is not null) Directory.CreateDirectory(effectiveResultsDir); diff --git a/eng/skill-validator/src/Services/SessionDatabase.cs b/eng/skill-validator/src/Services/SessionDatabase.cs new file mode 100644 index 00000000..ee3b4908 --- /dev/null +++ b/eng/skill-validator/src/Services/SessionDatabase.cs @@ -0,0 +1,257 @@ +using System.Security.Cryptography; +using Microsoft.Data.Sqlite; + +namespace SkillValidator.Services; + +/// +/// Tracks eval sessions in a SQLite database for crash recovery and rejudging. +/// Thread-safe for concurrent scenario/run execution. 
+///
+public sealed class SessionDatabase : IDisposable
+{
+    private readonly SqliteConnection _connection;
+    private readonly SemaphoreSlim _writeLock = new(1, 1);
+
+    public SessionDatabase(string dbPath)
+    {
+        Directory.CreateDirectory(Path.GetDirectoryName(Path.GetFullPath(dbPath))!); // resolve first: a bare file name has an empty directory part, which CreateDirectory rejects
+        _connection = new SqliteConnection(new SqliteConnectionStringBuilder { DataSource = dbPath }.ToString()); // builder quotes paths containing ';' or '='
+        _connection.Open();
+        Initialize();
+    }
+
+    private void Initialize()
+    {
+        using var cmd = _connection.CreateCommand();
+        cmd.CommandText = """
+            PRAGMA journal_mode=WAL;
+            PRAGMA busy_timeout=5000;
+
+            CREATE TABLE IF NOT EXISTS schema_info (
+                key TEXT PRIMARY KEY,
+                value TEXT NOT NULL
+            );
+            INSERT OR IGNORE INTO schema_info (key, value) VALUES ('type', 'skill-validator');
+            INSERT OR IGNORE INTO schema_info (key, value) VALUES ('version', '1');
+
+            CREATE TABLE IF NOT EXISTS sessions (
+                id TEXT PRIMARY KEY,
+                skill_name TEXT NOT NULL,
+                skill_path TEXT NOT NULL,
+                scenario_name TEXT NOT NULL,
+                run_index INTEGER NOT NULL,
+                role TEXT NOT NULL,
+                model TEXT NOT NULL,
+                config_dir TEXT,
+                work_dir TEXT,
+                prompt TEXT,
+                skill_sha TEXT,
+                status TEXT NOT NULL DEFAULT 'running',
+                started_at TEXT NOT NULL,
+                completed_at TEXT
+            );
+
+            CREATE TABLE IF NOT EXISTS run_results (
+                session_id TEXT PRIMARY KEY REFERENCES sessions(id),
+                metrics_json TEXT NOT NULL,
+                judge_json TEXT,
+                pairwise_json TEXT
+            );
+            """;
+        cmd.ExecuteNonQuery();
+    }
+
+    ///
+    /// Computes a SHA-256 hash over all files in a directory, sorted by relative path.
+    /// Returns the first 12 hex characters for a short, collision-resistant identifier.
+ /// + public static string ComputeDirectorySha(string dirPath) + { + using var sha = IncrementalHash.CreateHash(HashAlgorithmName.SHA256); + var files = Directory.GetFiles(dirPath, "*", SearchOption.AllDirectories) + .Select(f => Path.GetRelativePath(dirPath, f).Replace('\\', '/')) + .OrderBy(f => f, StringComparer.Ordinal) + .ToList(); + + foreach (var relPath in files) + { + sha.AppendData(System.Text.Encoding.UTF8.GetBytes(relPath)); + sha.AppendData(File.ReadAllBytes(Path.Combine(dirPath, relPath))); + } + + var hash = sha.GetHashAndReset(); + return Convert.ToHexString(hash)[..12].ToLowerInvariant(); + } + + public void RegisterSession(string sessionId, string skillName, string skillPath, + string scenarioName, int runIndex, string role, string model, + string? configDir, string? workDir, string? prompt = null, string? skillSha = null) + { + _writeLock.Wait(); + try + { + using var cmd = _connection.CreateCommand(); + cmd.CommandText = """ + INSERT INTO sessions (id, skill_name, skill_path, scenario_name, run_index, role, model, config_dir, work_dir, prompt, skill_sha, status, started_at) + VALUES ($id, $skill_name, $skill_path, $scenario_name, $run_index, $role, $model, $config_dir, $work_dir, $prompt, $skill_sha, 'running', $started_at) + """; + cmd.Parameters.AddWithValue("$id", sessionId); + cmd.Parameters.AddWithValue("$skill_name", skillName); + cmd.Parameters.AddWithValue("$skill_path", skillPath); + cmd.Parameters.AddWithValue("$scenario_name", scenarioName); + cmd.Parameters.AddWithValue("$run_index", runIndex); + cmd.Parameters.AddWithValue("$role", role); + cmd.Parameters.AddWithValue("$model", model); + cmd.Parameters.AddWithValue("$config_dir", (object?)configDir ?? DBNull.Value); + cmd.Parameters.AddWithValue("$work_dir", (object?)workDir ?? DBNull.Value); + cmd.Parameters.AddWithValue("$prompt", (object?)prompt ?? DBNull.Value); + cmd.Parameters.AddWithValue("$skill_sha", (object?)skillSha ?? 
DBNull.Value); + cmd.Parameters.AddWithValue("$started_at", DateTimeOffset.UtcNow.ToString("o")); + cmd.ExecuteNonQuery(); + } + finally { _writeLock.Release(); } + } + + public void CompleteSession(string sessionId, string status, string metricsJson) + { + _writeLock.Wait(); + try + { + using var transaction = _connection.BeginTransaction(); + + using (var cmd = _connection.CreateCommand()) + { + cmd.Transaction = transaction; + cmd.CommandText = "UPDATE sessions SET status = $status, completed_at = $completed_at WHERE id = $id"; + cmd.Parameters.AddWithValue("$id", sessionId); + cmd.Parameters.AddWithValue("$status", status); + cmd.Parameters.AddWithValue("$completed_at", DateTimeOffset.UtcNow.ToString("o")); + cmd.ExecuteNonQuery(); + } + + using (var cmd = _connection.CreateCommand()) + { + cmd.Transaction = transaction; + cmd.CommandText = """ + INSERT OR REPLACE INTO run_results (session_id, metrics_json) + VALUES ($session_id, $metrics_json) + """; + cmd.Parameters.AddWithValue("$session_id", sessionId); + cmd.Parameters.AddWithValue("$metrics_json", metricsJson); + cmd.ExecuteNonQuery(); + } + + transaction.Commit(); + } + finally { _writeLock.Release(); } + } + + public void SaveJudgeResult(string sessionId, string judgeJson) + { + _writeLock.Wait(); + try + { + using var cmd = _connection.CreateCommand(); + cmd.CommandText = "UPDATE run_results SET judge_json = $judge_json WHERE session_id = $session_id"; + cmd.Parameters.AddWithValue("$session_id", sessionId); + cmd.Parameters.AddWithValue("$judge_json", judgeJson); + cmd.ExecuteNonQuery(); + } + finally { _writeLock.Release(); } + } + + public void SavePairwiseResult(string baselineSessionId, string pairwiseJson) + { + _writeLock.Wait(); + try + { + using var cmd = _connection.CreateCommand(); + cmd.CommandText = "UPDATE run_results SET pairwise_json = $pairwise_json WHERE session_id = $session_id"; + cmd.Parameters.AddWithValue("$session_id", baselineSessionId); + 
cmd.Parameters.AddWithValue("$pairwise_json", pairwiseJson);
+            cmd.ExecuteNonQuery();
+        }
+        finally { _writeLock.Release(); }
+    }
+
+    /// <summary>
+    /// Returns every session whose status is 'completed' or 'timed_out', ordered by (skill, scenario, run_index, role).
+    /// </summary>
+    public List<SessionRecord> GetCompletedSessions()
+    {
+        return GetSessions("WHERE s.status IN ('completed', 'timed_out')");
+    }
+
+    /// <summary>
+    /// Returns schema metadata (type, version) for DB detection by external tools.
+    /// </summary>
+    public Dictionary<string, string> GetSchemaInfo()
+    {
+        var result = new Dictionary<string, string>();
+        using var cmd = _connection.CreateCommand();
+        cmd.CommandText = "SELECT key, value FROM schema_info";
+        using var reader = cmd.ExecuteReader();
+        while (reader.Read())
+            result[reader.GetString(0)] = reader.GetString(1);
+        return result;
+    }
+
+    private List<SessionRecord> GetSessions(string whereClause)
+    {
+        var results = new List<SessionRecord>();
+        using var cmd = _connection.CreateCommand();
+        cmd.CommandText = $"""
+            SELECT s.id, s.skill_name, s.skill_path, s.scenario_name, s.run_index, s.role, s.model,
+                   s.config_dir, s.work_dir, s.prompt, s.skill_sha, s.status,
+                   r.metrics_json, r.judge_json, r.pairwise_json
+            FROM sessions s
+            LEFT JOIN run_results r ON s.id = r.session_id
+            {whereClause}
+            ORDER BY s.skill_name, s.scenario_name, s.run_index, s.role
+            """;
+        using var reader = cmd.ExecuteReader();
+        while (reader.Read())
+        {
+            results.Add(new SessionRecord(
+                Id: reader.GetString(0),
+                SkillName: reader.GetString(1),
+                SkillPath: reader.GetString(2),
+                ScenarioName: reader.GetString(3),
+                RunIndex: reader.GetInt32(4),
+                Role: reader.GetString(5),
+                Model: reader.GetString(6),
+                ConfigDir: reader.IsDBNull(7) ? null : reader.GetString(7),
+                WorkDir: reader.IsDBNull(8) ? null : reader.GetString(8),
+                Prompt: reader.IsDBNull(9) ? null : reader.GetString(9),
+                SkillSha: reader.IsDBNull(10) ? null : reader.GetString(10),
+                Status: reader.GetString(11),
+                MetricsJson: reader.IsDBNull(12) ? null : reader.GetString(12),
+                JudgeJson: reader.IsDBNull(13) ? null : reader.GetString(13),
+                PairwiseJson: reader.IsDBNull(14) ? null : reader.GetString(14)));
+        }
+        return results;
+    }
+
+    public void Dispose()
+    {
+        _connection.Dispose();
+        _writeLock.Dispose();
+    }
+}
+
+public sealed record SessionRecord(
+    string Id,
+    string SkillName,
+    string SkillPath,
+    string ScenarioName,
+    int RunIndex,
+    string Role,
+    string Model,
+    string? ConfigDir,
+    string? WorkDir,
+    string? Prompt,
+    string? SkillSha,
+    string Status,
+    string? MetricsJson,
+    string? JudgeJson,
+    string? PairwiseJson);
diff --git a/eng/skill-validator/src/SkillValidator.csproj b/eng/skill-validator/src/SkillValidator.csproj
index 54479f8f..d0d9a0f7 100644
--- a/eng/skill-validator/src/SkillValidator.csproj
+++ b/eng/skill-validator/src/SkillValidator.csproj
@@ -17,6 +17,7 @@
+
diff --git a/eng/skill-validator/tests/SessionDatabaseTests.cs b/eng/skill-validator/tests/SessionDatabaseTests.cs
new file mode 100644
index 00000000..acd04eae
--- /dev/null
+++ b/eng/skill-validator/tests/SessionDatabaseTests.cs
@@ -0,0 +1,231 @@
+using SkillValidator.Services;
+
+namespace SkillValidator.Tests;
+
+public class SessionDatabaseTests : IDisposable
+{
+    private readonly string _dbPath;
+    private readonly SessionDatabase _db;
+
+    public SessionDatabaseTests()
+    {
+        _dbPath = Path.Combine(Path.GetTempPath(), $"test-sessions-{Guid.NewGuid()}.db");
+        _db = new SessionDatabase(_dbPath);
+    }
+
+    public void Dispose()
+    {
+        _db.Dispose();
+        // Clear SQLite connection pool so file handles are fully released
+        Microsoft.Data.Sqlite.SqliteConnection.ClearAllPools();
+        TryDelete(_dbPath);
+        TryDelete(_dbPath + "-wal");
+        TryDelete(_dbPath + "-shm");
+    }
+
+    private static void TryDelete(string path)
+    {
+        try { if (File.Exists(path)) File.Delete(path); } catch { /* best-effort cleanup */ }
+    }
+
+    [Fact]
+    public void RegisterAndComplete_RoundTrips()
+    {
+        _db.RegisterSession("s1", "my-skill", "/path/to/skill", "scenario-a", 0, "baseline", "gpt-4.1", "sessions/s1", "/work", "Fix the bug", "abcdef012345");
+        _db.CompleteSession("s1", "completed", """{"TokenEstimate":100}""");
+
+        var sessions = _db.GetCompletedSessions();
+        var s = Assert.Single(sessions);
+        Assert.Equal("s1", s.Id);
+        Assert.Equal("my-skill", s.SkillName);
+        Assert.Equal("/path/to/skill", s.SkillPath);
+        Assert.Equal("scenario-a", s.ScenarioName);
+        Assert.Equal(0, s.RunIndex);
+        Assert.Equal("baseline", s.Role);
+        Assert.Equal("gpt-4.1", s.Model);
+        Assert.Equal("sessions/s1", s.ConfigDir);
+        Assert.Equal("completed", s.Status);
+        Assert.Equal("Fix the bug", s.Prompt);
+        Assert.Equal("abcdef012345", s.SkillSha);
+        Assert.Equal("""{"TokenEstimate":100}""", s.MetricsJson);
+        Assert.Null(s.JudgeJson);
+        Assert.Null(s.PairwiseJson);
+    }
+
+    [Fact]
+    public void SaveJudgeResult_UpdatesExistingRow()
+    {
+        _db.RegisterSession("s1", "skill", "/p", "scn", 0, "baseline", "model", null, null);
+        _db.CompleteSession("s1", "completed", "{}");
+        _db.SaveJudgeResult("s1", """{"OverallScore":4}""");
+
+        var s = Assert.Single(_db.GetCompletedSessions());
+        Assert.Equal("""{"OverallScore":4}""", s.JudgeJson);
+    }
+
+    [Fact]
+    public void SavePairwiseResult_UpdatesExistingRow()
+    {
+        _db.RegisterSession("s1", "skill", "/p", "scn", 0, "baseline", "model", null, null);
+        _db.CompleteSession("s1", "completed", "{}");
+        _db.SavePairwiseResult("s1", """{"Winner":"with-skill"}""");
+
+        var s = Assert.Single(_db.GetCompletedSessions());
+        Assert.Equal("""{"Winner":"with-skill"}""", s.PairwiseJson);
+    }
+
+    [Fact]
+    public void RegisterWithoutPromptOrSkillSha_StoresNulls()
+    {
+        _db.RegisterSession("s1", "skill", "/p", "scn", 0, "baseline", "model", null, null);
+        _db.CompleteSession("s1", "completed", "{}");
+
+        var s = Assert.Single(_db.GetCompletedSessions());
+        Assert.Null(s.Prompt);
+        Assert.Null(s.SkillSha);
+    }
+
+    [Fact]
+    public void GetCompletedSessions_ExcludesRunning()
+    {
+        _db.RegisterSession("s1", "skill", "/p", "scn", 0, "baseline", "model", null, null);
+        // Never completed — should not appear
+        var sessions = _db.GetCompletedSessions();
+        Assert.Empty(sessions);
+    }
+
+    [Fact]
+    public void GetCompletedSessions_IncludesTimedOut()
+    {
+        _db.RegisterSession("s1", "skill", "/p", "scn", 0, "baseline", "model", null, null);
+        _db.CompleteSession("s1", "timed_out", "{}");
+
+        var sessions = _db.GetCompletedSessions();
+        Assert.Single(sessions);
+        Assert.Equal("timed_out", sessions[0].Status);
+    }
+
+    [Fact]
+    public void MultipleSessions_OrderedCorrectly()
+    {
+        // Register pairs for two scenarios
+        _db.RegisterSession("b0", "skill", "/p", "alpha", 0, "baseline", "m", null, null);
+        _db.RegisterSession("w0", "skill", "/p", "alpha", 0, "with-skill", "m", null, null);
+        _db.RegisterSession("b1", "skill", "/p", "beta", 0, "baseline", "m", null, null);
+        _db.RegisterSession("w1", "skill", "/p", "beta", 0, "with-skill", "m", null, null);
+
+        _db.CompleteSession("b0", "completed", "{}");
+        _db.CompleteSession("w0", "completed", "{}");
+        _db.CompleteSession("b1", "completed", "{}");
+        _db.CompleteSession("w1", "completed", "{}");
+
+        var sessions = _db.GetCompletedSessions();
+        Assert.Equal(4, sessions.Count);
+        // Ordered by skill_name, scenario_name, run_index, role
+        Assert.Equal("alpha", sessions[0].ScenarioName);
+        Assert.Equal("baseline", sessions[0].Role);
+        Assert.Equal("alpha", sessions[1].ScenarioName);
+        Assert.Equal("with-skill", sessions[1].Role);
+        Assert.Equal("beta", sessions[2].ScenarioName);
+    }
+
+    [Fact]
+    public async Task ConcurrentWrites_DoNotCorrupt()
+    {
+        const int count = 20;
+        var tasks = Enumerable.Range(0, count).Select(i => Task.Run(() =>
+        {
+            var id = $"s{i}";
+            _db.RegisterSession(id, "skill", "/p", "scn", i, i % 2 == 0 ? "baseline" : "with-skill", "m", null, null);
+            _db.CompleteSession(id, "completed", $"{{\"Index\":{i}}}");
+            _db.SaveJudgeResult(id, $"{{\"Score\":{i}}}");
+        }));
+
+        await Task.WhenAll(tasks);
+
+        var sessions = _db.GetCompletedSessions();
+        Assert.Equal(count, sessions.Count);
+        Assert.All(sessions, s =>
+        {
+            Assert.Equal("completed", s.Status);
+            Assert.NotNull(s.MetricsJson);
+            Assert.NotNull(s.JudgeJson);
+        });
+    }
+
+    [Fact]
+    public void ComputeDirectorySha_IsDeterministic()
+    {
+        var dir = Path.Combine(Path.GetTempPath(), $"sha-test-{Guid.NewGuid()}");
+        Directory.CreateDirectory(dir);
+        try
+        {
+            File.WriteAllText(Path.Combine(dir, "SKILL.md"), "# Test Skill");
+            File.WriteAllText(Path.Combine(dir, "plugin.json"), "{}");
+
+            var sha1 = SessionDatabase.ComputeDirectorySha(dir);
+            var sha2 = SessionDatabase.ComputeDirectorySha(dir);
+            Assert.Equal(sha1, sha2);
+            Assert.Equal(12, sha1.Length);
+
+            // Changing content produces a different SHA
+            File.WriteAllText(Path.Combine(dir, "SKILL.md"), "# Modified");
+            var sha3 = SessionDatabase.ComputeDirectorySha(dir);
+            Assert.NotEqual(sha1, sha3);
+        }
+        finally
+        {
+            Directory.Delete(dir, true);
+        }
+    }
+
+    [Fact]
+    public void SeparateDbFiles_AreIndependent()
+    {
+        // Simulates two concurrent eval processes using different result dirs
+        var dbPath2 = Path.Combine(Path.GetTempPath(), $"test-sessions-{Guid.NewGuid()}.db");
+        try
+        {
+            using var db2 = new SessionDatabase(dbPath2);
+
+            _db.RegisterSession("s1", "skill-a", "/a", "scn", 0, "baseline", "m", null, null);
+            _db.CompleteSession("s1", "completed", "{}");
+
+            db2.RegisterSession("s1", "skill-b", "/b", "scn", 0, "baseline", "m", null, null);
+            db2.CompleteSession("s1", "completed", "{}");
+
+            // Each DB has exactly one session with different skill names
+            var sessions1 = _db.GetCompletedSessions();
+            var sessions2 = db2.GetCompletedSessions();
+            Assert.Single(sessions1);
+            Assert.Single(sessions2);
+            Assert.Equal("skill-a", sessions1[0].SkillName);
+            Assert.Equal("skill-b", sessions2[0].SkillName);
+        }
+        finally
+        {
+            Microsoft.Data.Sqlite.SqliteConnection.ClearAllPools();
+            TryDelete(dbPath2);
+            TryDelete(dbPath2 + "-wal");
+            TryDelete(dbPath2 + "-shm");
+        }
+    }
+
+    [Fact]
+    public void SchemaInfo_ContainsTypeAndVersion()
+    {
+        var info = _db.GetSchemaInfo();
+        Assert.Equal("skill-validator", info["type"]);
+        Assert.Equal("1", info["version"]);
+    }
+
+    [Fact]
+    public void ConfigDir_StoredAsRelativePath()
+    {
+        _db.RegisterSession("s1", "skill", "/p", "scn", 0, "baseline", "m", "sessions/s1", null);
+        _db.CompleteSession("s1", "completed", "{}");
+
+        var s = Assert.Single(_db.GetCompletedSessions());
+        Assert.Equal("sessions/s1", s.ConfigDir);
+    }
+}