sveltejs · paoloricciuti · Dec 25, 2025 · Dec 22, 2025 · Dec 22, 2025 · Dec 22, 2025
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -30,5 +30,8 @@ jobs:
       - name: TypeScript type check
         run: bun run tsc
 
-      - name: Run tests
+      - name: Run self-tests
         run: bun test
+
+      - name: Run benchmark tests
+        run: bun run verify-references
diff --git a/.gitignore b/.gitignore
@@ -42,3 +42,6 @@ results/*
 !results/*.json
 .vercel
 .env*.local
+
+# AI benchmark settings (auto-generated)
+.ai-settings.json
diff --git a/AGENTS.md b/AGENTS.md
@@ -279,3 +279,7 @@ Run unit tests with: `bun run test:self`
 - MCP status is clearly indicated in both the JSON metadata and HTML report with a visual badge
 - Exit code is 0 if all tests pass, 1 if any tests fail
 - Pricing is fetched from Vercel AI Gateway model metadata at runtime
+
+## Self-tests
+
+To run the self-test suite, run `bun test`.
diff --git a/bun.lock b/bun.lock
@@ -14,6 +14,7 @@
         "@testing-library/svelte": "^5.2.9",
         "@testing-library/user-event": "^14.6.1",
         "ai": "^5.0.108",
+        "p-retry": "^7.0.0",
         "vercel": "^49.1.2",
         "vitest": "^4.0.15",
         "zod": "^4.1.13",
@@ -735,6 +736,8 @@
 
     "is-glob": ["[email protected]", "", { "dependencies": { "is-extglob": "^2.1.1" } }, "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg=="],
 
+    "is-network-error": ["[email protected]", "", {}, "sha512-6oIwpsgRfnDiyEDLMay/GqCl3HoAtH5+RUKW29gYkL0QA+ipzpDLA16yQs7/RHCSu+BwgbJaOUqa4A99qNVQVw=="],
+
     "is-node-process": ["[email protected]", "", {}, "sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw=="],
 
     "is-number": ["[email protected]", "", {}, "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng=="],
@@ -863,6 +866,8 @@
 
     "p-locate": ["[email protected]", "", { "dependencies": { "p-limit": "^3.0.2" } }, "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw=="],
 
+    "p-retry": ["[email protected]", "", { "dependencies": { "is-network-error": "^1.1.0" } }, "sha512-J5ApzjyRkkf601HpEeykoiCvzHQjWxPAHhyjFcEUP2SWq0+35NKh8TLhpLw+Dkq5TZBFvUM6UigdE9hIVYTl5w=="],
+
     "parent-module": ["[email protected]", "", { "dependencies": { "callsites": "^3.0.0" } }, "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g=="],
 
     "parse-ms": ["[email protected]", "", {}, "sha512-kHt7kzLoS9VBZfUsiKjv43mr91ea+U05EyKkEtqp7vNbHxmaVuEqN7XxeEVnGrMtYOAxGrDElSi96K7EgO1zCA=="],

diff --git a/index.ts b/index.ts
@@ -1,16 +1,18 @@
 import { Experimental_Agent as Agent, hasToolCall, stepCountIs } from "ai";
 import { experimental_createMCPClient as createMCPClient } from "./node_modules/@ai-sdk/mcp/dist/index.mjs";
 import { Experimental_StdioMCPTransport as StdioMCPTransport } from "./node_modules/@ai-sdk/mcp/dist/mcp-stdio/index.mjs";
-import { writeFileSync, mkdirSync, existsSync } from "node:fs";
+import { writeFileSync, mkdirSync, existsSync, readFileSync } from "node:fs";
 import {
   generateReport,
+  calculateUnitTestTotals,
   type SingleTestResult,
 } from "./lib/report.ts";
 import {
   getTimestampedFilename,
   isHttpUrl,
   extractResultWriteContent,
   calculateTotalCost,
+  withRetry,
 } from "./lib/utils.ts";
 import {
   discoverTests,
@@ -46,9 +48,40 @@ import {
 } from "@clack/prompts";
 import { gateway } from "ai";
 
+/**
+ * Relative path of the JSON file that `loadSettings` reads and
+ * `saveSettings` writes. Listed in `.gitignore` as auto-generated.
+ */
+const SETTINGS_FILE = ".ai-settings.json";
+
+/**
+ * Answers from the interactive prompts, persisted so they can be offered as
+ * defaults on the next run.
+ */
+interface SavedSettings {
+  /** Selected model ids; the "custom" placeholder option is filtered out before saving. */
+  models: string[];
+  /** Chosen MCP integration; "none" means no MCP server is used. */
+  mcpIntegration: "none" | "http" | "stdio";
+  /** MCP server URL or command; left undefined when `mcpIntegration` is "none". */
+  mcpServerUrl?: string;
+  /** Answer to "Do you want to provide the testing tool to the model?". */
+  testingTool: boolean;
+  /** Whether cost calculation was enabled; seeds the "Enable cost calculation?" prompt. */
+  pricingEnabled: boolean;
+}
+
+/**
+ * Runtime check that a parsed JSON value has the `SavedSettings` shape.
+ *
+ * The settings file is plain JSON on disk and may have been edited by hand or
+ * written by a different version, so nothing about its shape can be assumed.
+ */
+function isSavedSettings(value: unknown): value is SavedSettings {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+
+  const { models, mcpIntegration, mcpServerUrl, testingTool, pricingEnabled } =
+    value as Record<string, unknown>;
+
+  return (
+    Array.isArray(models) &&
+    models.every((model) => typeof model === "string") &&
+    (mcpIntegration === "none" ||
+      mcpIntegration === "http" ||
+      mcpIntegration === "stdio") &&
+    (mcpServerUrl === undefined || typeof mcpServerUrl === "string") &&
+    typeof testingTool === "boolean" &&
+    typeof pricingEnabled === "boolean"
+  );
+}
+
+/**
+ * Reads previously saved prompt answers from `SETTINGS_FILE`.
+ *
+ * The parsed value is shape-checked before it is returned: callers treat
+ * `models` as an array and pass the other fields to prompts as initial
+ * values, so a malformed file must not get past this function.
+ *
+ * @returns The saved settings, or `null` when the file is missing,
+ *   unreadable, not valid JSON, or does not match `SavedSettings`. Every case
+ *   except a missing file also logs a warning.
+ */
+function loadSettings(): SavedSettings | null {
+  if (!existsSync(SETTINGS_FILE)) {
+    return null;
+  }
+
+  try {
+    const parsed: unknown = JSON.parse(readFileSync(SETTINGS_FILE, "utf-8"));
+    if (isSavedSettings(parsed)) {
+      return parsed;
+    }
+  } catch {
+    // Read or parse failure: handled by the shared warning below.
+  }
+
+  console.warn("⚠️  Could not load saved settings, using defaults");
+  return null;
+}
+
+/**
+ * Writes `settings` to `SETTINGS_FILE` as JSON indented with two spaces.
+ *
+ * Best-effort: if serialising or writing throws, a warning is logged and the
+ * error is not propagated.
+ */
+function saveSettings(settings: SavedSettings): void {
+  try {
+    const serialized = JSON.stringify(settings, null, 2);
+    writeFileSync(SETTINGS_FILE, serialized);
+  } catch {
+    console.warn("⚠️  Could not save settings");
+  }
+}
+
 async function validateAndConfirmPricing(
   models: string[],
   pricingMap: Map<string, ModelPricingLookup | null>,
+  savedPricingEnabled?: boolean,
 ) {
   const lookups = new Map<string, ModelPricingLookup | null>();
 
@@ -71,7 +104,7 @@ async function validateAndConfirmPricing(
 
     const usePricing = await confirm({
       message: "Enable cost calculation?",
-      initialValue: true,
+      initialValue: savedPricingEnabled ?? true,
     });
 
     if (isCancel(usePricing)) {
@@ -124,23 +157,35 @@ async function validateAndConfirmPricing(
 async function selectOptions() {
   intro("🚀 Svelte AI Bench");
 
+  const savedSettings = loadSettings();
+  if (savedSettings) {
+    note("Loaded previous settings as defaults", "📋 Saved Settings");
+  }
+
   const availableModels = await gateway.getAvailableModels();
 
   const gatewayModels = availableModels.models as GatewayModel[];
   const pricingMap = buildPricingMap(gatewayModels);
 
+  const modelOptions = [{ value: "custom", label: "Custom" }].concat(
+    availableModels.models.reduce<Array<{ value: string; label: string }>>(
+      (arr, model) => {
+        if (model.modelType === "language") {
+          arr.push({ value: model.id, label: model.name });
+        }
+        return arr;
+      },
+      [],
+    ),
+  );
+
+  const savedModelValues = savedSettings?.models ?? [];
+
   const models = await multiselect({
     message: "Select model(s) to benchmark",
-    options: [{ value: "custom", label: "Custom" }].concat(
-      availableModels.models.reduce<Array<{ value: string; label: string }>>(
-        (arr, model) => {
-          if (model.modelType === "language") {
-            arr.push({ value: model.id, label: model.name });
-          }
-          return arr;
-        },
-        [],
-      ),
+    options: modelOptions,
+    initialValues: savedModelValues.filter((m) =>
+      modelOptions.some((opt) => opt.value === m),
     ),
   });
 
@@ -162,7 +207,13 @@ async function selectOptions() {
 
   const selectedModels = models.filter((model) => model !== "custom");
 
-  const pricing = await validateAndConfirmPricing(selectedModels, pricingMap);
+  const pricing = await validateAndConfirmPricing(
+    selectedModels,
+    pricingMap,
+    savedSettings?.pricingEnabled,
+  );
+
+  const savedMcpIntegration = savedSettings?.mcpIntegration ?? "none";
 
   const mcpIntegration = await select({
     message: "Which MCP integration to use?",
@@ -171,6 +222,7 @@ async function selectOptions() {
       { value: "http", label: "MCP over HTTP" },
       { value: "stdio", label: "MCP over StdIO" },
     ],
+    initialValue: savedMcpIntegration,
   });
 
   if (isCancel(mcpIntegration)) {
@@ -179,11 +231,25 @@ async function selectOptions() {
   }
 
   let mcp: string | undefined = undefined;
+  let mcpIntegrationType: "none" | "http" | "stdio" = "none";
 
   if (mcpIntegration !== "none") {
+    mcpIntegrationType = mcpIntegration as "http" | "stdio";
+
+    const savedMcpUrl = savedSettings?.mcpServerUrl;
+    const defaultMcpUrl =
+      mcpIntegration === "http"
+        ? "https://mcp.svelte.dev/mcp"
+        : "npx -y @sveltejs/mcp";
+
+    const hasSavedCustomUrl =
+      !!savedMcpUrl &&
+      savedSettings?.mcpIntegration === mcpIntegration &&
+      savedMcpUrl !== defaultMcpUrl;
+
     const custom = await confirm({
       message: "Do you want to provide a custom MCP server/command?",
-      initialValue: false,
+      initialValue: hasSavedCustomUrl ?? false,
     });
 
     if (isCancel(custom)) {
@@ -194,6 +260,7 @@ async function selectOptions() {
     if (custom) {
       const customMcp = await text({
         message: "Insert custom url or command",
+        initialValue: hasSavedCustomUrl ? savedMcpUrl : undefined,
       });
       if (isCancel(customMcp)) {
         cancel("Operation cancelled.");
@@ -202,23 +269,29 @@ async function selectOptions() {
 
       mcp = customMcp;
     } else {
-      mcp =
-        mcpIntegration === "http"
-          ? "https://mcp.svelte.dev/mcp"
-          : "npx -y @sveltejs/mcp";
+      mcp = defaultMcpUrl;
     }
   }
 
   const testingTool = await confirm({
     message: "Do you want to provide the testing tool to the model?",
-    initialValue: true,
+    initialValue: savedSettings?.testingTool ?? true,
   });
 
   if (isCancel(testingTool)) {
     cancel("Operation cancelled.");
     process.exit(0);
   }
 
+  const newSettings: SavedSettings = {
+    models: selectedModels,
+    mcpIntegration: mcpIntegrationType,
+    mcpServerUrl: mcp,
+    testingTool,
+    pricingEnabled: pricing.enabled,
+  };
+  saveSettings(newSettings);
+
   return {
     models: selectedModels,
     mcp,
@@ -301,7 +374,15 @@ async function runSingleTest(
     if (testComponentEnabled) {
       console.log("  📋 TestComponent tool is available");
     }
-    const result = await agent.generate({ prompt: fullPrompt });
+
+    const result = await withRetry(
+      async () => agent.generate({ prompt: fullPrompt }),
+      {
+        retries: 10,
+        minTimeout: 1000,
+        factor: 2,
+      },
+    );
 
     const resultWriteContent = extractResultWriteContent(result.steps);
 
@@ -321,17 +402,32 @@ async function runSingleTest(
     console.log("  ⏳ Verifying against tests...");
     const verification = await runTestVerification(test, resultWriteContent);
 
-    if (verification.passed) {
+    if (verification.validation) {
+      if (verification.validation.valid) {
+        console.log("  ✓ Code validation passed");
+      } else {
+        console.log("  ✗ Code validation failed:");
+        for (const error of verification.validation.errors) {
+          console.log(`    - ${error}`);
+        }
+      }
+    }
+
+    if (verification.validationFailed) {
+      console.log(
+        `  ⊘ Validation failed (${verification.numPassed}/${verification.numTests} tests passed)`,
+      );
+    } else if (verification.passed) {
       console.log(
-        `✓ All tests passed (${verification.numPassed}/${verification.numTests})`,
+        `  ✓ All tests passed (${verification.numPassed}/${verification.numTests})`,
       );
     } else {
       console.log(
-        `✗ Tests failed (${verification.numFailed}/${verification.numTests} failed)`,
+        `  ✗ Tests failed (${verification.numFailed}/${verification.numTests} failed)`,
       );
       if (verification.failedTests) {
         for (const ft of verification.failedTests) {
-          console.log(`- ${ft.fullName}`);
+          console.log(`    - ${ft.fullName}`);
         }
       }
     }
@@ -346,7 +442,7 @@ async function runSingleTest(
       verification,
     };
   } catch (error) {
-    console.error(`✗ Error running test: ${error}`);
+    console.error(`  ✗ Error running test: ${error}`);
     return {
       testName: test.name,
       prompt: fullPrompt,
@@ -463,7 +559,7 @@ async function main() {
 
     const model = gateway.languageModel(modelId);
 
-    const testResults = [];
+    const testResults: SingleTestResult[] = [];
     const startTime = Date.now();
 
     for (let i = 0; i < tests.length; i++) {
@@ -493,6 +589,8 @@ async function main() {
     totalFailed += failed;
     const skipped = testResults.filter((r) => !r.verification).length;
 
+    const unitTestTotals = calculateUnitTestTotals(testResults);
+
     for (const result of testResults) {
       const status = result.verification
         ? result.verification.passed
@@ -504,13 +602,28 @@ async function main() {
           ? "PASSED"
           : "FAILED"
         : "SKIPPED";
-      console.log(`${status} ${result.testName}: ${statusText}`);
+
+      const validationInfo = result.verification?.validation
+        ? result.verification.validation.valid
+          ? " (validated)"
+          : " (validation failed)"
+        : "";
+
+      const unitTestInfo = result.verification
+        ? ` [${result.verification.numPassed}/${result.verification.numTests} unit tests]`
+        : "";
+
+      console.log(
+        `${status} ${result.testName}: ${statusText}${validationInfo}${unitTestInfo}`,
+      );
     }
 
     console.log("─".repeat(50));
     console.log(
-      `Total: ${passed} passed, ${failed} failed, ${skipped} skipped (${(totalDuration / 1000).toFixed(1)}s)`,
+      `Test Suites: ✓ ${passed} passed  ✗ ${failed} failed  ${skipped > 0 ? `⊘ ${skipped} skipped  ` : ""}(${unitTestTotals.passed}/${unitTestTotals.total} unit tests)`,
     );
+    console.log(`Score:       ${unitTestTotals.score}%`);
+    console.log(`Duration:    ${(totalDuration / 1000).toFixed(1)}s`);
 
     let totalCost = null;
     let pricingInfo = null;
@@ -561,6 +674,7 @@ async function main() {
         pricingKey: pricingLookup?.matchedKey ?? null,
         pricing: pricingInfo,
         totalCost,
+        unitTestTotals,
       },
     };