Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ba3709f
add 1.0 tests, rewritten
khromov Dec 22, 2025
a954c91
wip
khromov Dec 22, 2025
cb6abd8
wip
khromov Dec 22, 2025
3f2830a
Update Reference.svelte
khromov Dec 22, 2025
b21cd6e
add validators
khromov Dec 22, 2025
7d82289
Update index.ts
khromov Dec 22, 2025
839c050
wip
khromov Dec 22, 2025
b899e3e
fix
khromov Dec 22, 2025
b9e41c7
Create result-2025-12-22-20-36-07-anthropic-claude-haiku-4.5.json
khromov Dec 22, 2025
e1fcbe7
save old settings
khromov Dec 22, 2025
c0bf019
wip
khromov Dec 22, 2025
ec32cb4
Update test.yml
khromov Dec 23, 2025
f106225
wip
khromov Dec 23, 2025
a8cc474
Update report-template.ts
khromov Dec 23, 2025
d2c215e
Update report-template.ts
khromov Dec 23, 2025
8af63f7
Update index.ts
khromov Dec 23, 2025
2d88284
remove unnecessary comments
khromov Dec 23, 2025
23a93ce
validation fail
khromov Dec 25, 2025
f52897d
fix exit code
khromov Dec 25, 2025
b6d9915
Update AGENTS.md
khromov Dec 25, 2025
1dc642c
Update report-template.ts
khromov Dec 25, 2025
cdc3b03
remove old reports
khromov Dec 25, 2025
f42a109
Create result-2025-12-25-15-33-32-anthropic-claude-haiku-4.5.json
khromov Dec 25, 2025
34a62c9
Update output-test-runner.ts
khromov Dec 25, 2025
86a30ae
fix output
khromov Dec 25, 2025
8112764
fix counting discrepancy
khromov Dec 25, 2025
33cfe35
wip
khromov Dec 25, 2025
cca9c42
Create result-2025-12-25-16-20-03-alibaba-qwen-3-32b.json
khromov Dec 25, 2025
32d3f1e
Update output-test-runner.ts
khromov Dec 25, 2025
0e10129
clean up legacy report handling
khromov Dec 25, 2025
59da2cd
Create result-2025-12-25-16-30-17-anthropic-claude-sonnet-4.5.json
khromov Dec 25, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,8 @@ jobs:
- name: TypeScript type check
run: bun run tsc

- name: Run tests
- name: Run self-tests
run: bun test

- name: Run benchmark tests
run: bun run verify-references
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,6 @@ results/*
!results/*.json
.vercel
.env*.local

# AI benchmark settings (auto-generated)
.ai-settings.json
4 changes: 4 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,3 +279,7 @@ Run unit tests with: `bun run test:self`
- MCP status is clearly indicated in both the JSON metadata and HTML report with a visual badge
- Exit code is 0 if all tests pass, 1 if any tests fail
- Pricing is fetched from Vercel AI Gateway model metadata at runtime

## Self-tests

To run the self-test suite, run `bun test`.
5 changes: 5 additions & 0 deletions bun.lock
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"@testing-library/svelte": "^5.2.9",
"@testing-library/user-event": "^14.6.1",
"ai": "^5.0.108",
"p-retry": "^7.0.0",
"vercel": "^49.1.2",
"vitest": "^4.0.15",
"zod": "^4.1.13",
Expand Down Expand Up @@ -735,6 +736,8 @@

"is-glob": ["[email protected]", "", { "dependencies": { "is-extglob": "^2.1.1" } }, "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg=="],

"is-network-error": ["[email protected]", "", {}, "sha512-6oIwpsgRfnDiyEDLMay/GqCl3HoAtH5+RUKW29gYkL0QA+ipzpDLA16yQs7/RHCSu+BwgbJaOUqa4A99qNVQVw=="],

"is-node-process": ["[email protected]", "", {}, "sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw=="],

"is-number": ["[email protected]", "", {}, "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng=="],
Expand Down Expand Up @@ -863,6 +866,8 @@

"p-locate": ["[email protected]", "", { "dependencies": { "p-limit": "^3.0.2" } }, "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw=="],

"p-retry": ["[email protected]", "", { "dependencies": { "is-network-error": "^1.1.0" } }, "sha512-J5ApzjyRkkf601HpEeykoiCvzHQjWxPAHhyjFcEUP2SWq0+35NKh8TLhpLw+Dkq5TZBFvUM6UigdE9hIVYTl5w=="],

"parent-module": ["[email protected]", "", { "dependencies": { "callsites": "^3.0.0" } }, "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g=="],

"parse-ms": ["[email protected]", "", {}, "sha512-kHt7kzLoS9VBZfUsiKjv43mr91ea+U05EyKkEtqp7vNbHxmaVuEqN7XxeEVnGrMtYOAxGrDElSi96K7EgO1zCA=="],
Expand Down
170 changes: 142 additions & 28 deletions index.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
import { Experimental_Agent as Agent, hasToolCall, stepCountIs } from "ai";
import { experimental_createMCPClient as createMCPClient } from "./node_modules/@ai-sdk/mcp/dist/index.mjs";
import { Experimental_StdioMCPTransport as StdioMCPTransport } from "./node_modules/@ai-sdk/mcp/dist/mcp-stdio/index.mjs";
import { writeFileSync, mkdirSync, existsSync } from "node:fs";
import { writeFileSync, mkdirSync, existsSync, readFileSync } from "node:fs";
import {
generateReport,
calculateUnitTestTotals,
type SingleTestResult,
} from "./lib/report.ts";
import {
getTimestampedFilename,
isHttpUrl,
extractResultWriteContent,
calculateTotalCost,
withRetry,
} from "./lib/utils.ts";
import {
discoverTests,
Expand Down Expand Up @@ -46,9 +48,40 @@ import {
} from "@clack/prompts";
import { gateway } from "ai";

/**
 * File (relative to the current working directory) in which the answers of
 * the last interactive run are stored so they can be offered as defaults.
 */
const SETTINGS_FILE = ".ai-settings.json";

/** Answers of a previous interactive run, as persisted in `SETTINGS_FILE`. */
interface SavedSettings {
  /** Ids of the models that were selected for benchmarking. */
  models: string[];
  /** Which MCP integration was chosen. */
  mcpIntegration: "none" | "http" | "stdio";
  /** MCP server URL or command that was used; absent when no MCP was selected. */
  mcpServerUrl?: string;
  /** Whether the testing tool was provided to the model. */
  testingTool: boolean;
  /** Whether cost calculation was enabled. */
  pricingEnabled: boolean;
}

/**
 * Runtime check that a parsed JSON value really has the `SavedSettings` shape.
 *
 * The settings file lives on disk and may be stale, hand-edited or truncated,
 * so its content must not be trusted just because it parsed as JSON.
 */
function isSavedSettings(value: unknown): value is SavedSettings {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  const { models, mcpIntegration, mcpServerUrl } = candidate;

  return (
    Array.isArray(models) &&
    models.every((model) => typeof model === "string") &&
    (mcpIntegration === "none" ||
      mcpIntegration === "http" ||
      mcpIntegration === "stdio") &&
    (mcpServerUrl === undefined || typeof mcpServerUrl === "string") &&
    typeof candidate.testingTool === "boolean" &&
    typeof candidate.pricingEnabled === "boolean"
  );
}

/**
 * Reads the previously saved settings from `SETTINGS_FILE`.
 *
 * @returns The saved settings, or `null` when the file does not exist, cannot
 * be read or parsed, or does not have the expected shape. In the latter cases
 * a warning is printed; a missing file is silent.
 */
function loadSettings(): SavedSettings | null {
  const warnUnusable = (): void => {
    console.warn("⚠️ Could not load saved settings, using defaults");
  };

  try {
    if (!existsSync(SETTINGS_FILE)) {
      return null;
    }

    const parsed: unknown = JSON.parse(readFileSync(SETTINGS_FILE, "utf-8"));
    if (isSavedSettings(parsed)) {
      return parsed;
    }

    // Valid JSON but not a settings object: fall back to defaults instead of
    // handing a malformed value to the prompts.
    warnUnusable();
  } catch {
    // Unreadable file or invalid JSON.
    warnUnusable();
  }
  return null;
}

/**
 * Persists the given settings to `SETTINGS_FILE` as pretty-printed JSON
 * (2-space indentation).
 *
 * Saving is best-effort: if serialising or writing fails, a warning is
 * printed and the failure is otherwise ignored.
 */
function saveSettings(settings: SavedSettings): void {
  try {
    const serialized = JSON.stringify(settings, null, 2);
    writeFileSync(SETTINGS_FILE, serialized);
  } catch {
    console.warn("⚠️ Could not save settings");
  }
}

async function validateAndConfirmPricing(
models: string[],
pricingMap: Map<string, ModelPricingLookup | null>,
savedPricingEnabled?: boolean,
) {
const lookups = new Map<string, ModelPricingLookup | null>();

Expand All @@ -71,7 +104,7 @@ async function validateAndConfirmPricing(

const usePricing = await confirm({
message: "Enable cost calculation?",
initialValue: true,
initialValue: savedPricingEnabled ?? true,
});

if (isCancel(usePricing)) {
Expand Down Expand Up @@ -124,23 +157,35 @@ async function validateAndConfirmPricing(
async function selectOptions() {
intro("🚀 Svelte AI Bench");

const savedSettings = loadSettings();
if (savedSettings) {
note("Loaded previous settings as defaults", "📋 Saved Settings");
}

const availableModels = await gateway.getAvailableModels();

const gatewayModels = availableModels.models as GatewayModel[];
const pricingMap = buildPricingMap(gatewayModels);

const modelOptions = [{ value: "custom", label: "Custom" }].concat(
availableModels.models.reduce<Array<{ value: string; label: string }>>(
(arr, model) => {
if (model.modelType === "language") {
arr.push({ value: model.id, label: model.name });
}
return arr;
},
[],
),
);

const savedModelValues = savedSettings?.models ?? [];

const models = await multiselect({
message: "Select model(s) to benchmark",
options: [{ value: "custom", label: "Custom" }].concat(
availableModels.models.reduce<Array<{ value: string; label: string }>>(
(arr, model) => {
if (model.modelType === "language") {
arr.push({ value: model.id, label: model.name });
}
return arr;
},
[],
),
options: modelOptions,
initialValues: savedModelValues.filter((m) =>
modelOptions.some((opt) => opt.value === m),
),
});

Expand All @@ -162,7 +207,13 @@ async function selectOptions() {

const selectedModels = models.filter((model) => model !== "custom");

const pricing = await validateAndConfirmPricing(selectedModels, pricingMap);
const pricing = await validateAndConfirmPricing(
selectedModels,
pricingMap,
savedSettings?.pricingEnabled,
);

const savedMcpIntegration = savedSettings?.mcpIntegration ?? "none";

const mcpIntegration = await select({
message: "Which MCP integration to use?",
Expand All @@ -171,6 +222,7 @@ async function selectOptions() {
{ value: "http", label: "MCP over HTTP" },
{ value: "stdio", label: "MCP over StdIO" },
],
initialValue: savedMcpIntegration,
});

if (isCancel(mcpIntegration)) {
Expand All @@ -179,11 +231,25 @@ async function selectOptions() {
}

let mcp: string | undefined = undefined;
let mcpIntegrationType: "none" | "http" | "stdio" = "none";

if (mcpIntegration !== "none") {
mcpIntegrationType = mcpIntegration as "http" | "stdio";

const savedMcpUrl = savedSettings?.mcpServerUrl;
const defaultMcpUrl =
mcpIntegration === "http"
? "https://mcp.svelte.dev/mcp"
: "npx -y @sveltejs/mcp";

const hasSavedCustomUrl =
!!savedMcpUrl &&
savedSettings?.mcpIntegration === mcpIntegration &&
savedMcpUrl !== defaultMcpUrl;

const custom = await confirm({
message: "Do you want to provide a custom MCP server/command?",
initialValue: false,
initialValue: hasSavedCustomUrl ?? false,
});

if (isCancel(custom)) {
Expand All @@ -194,6 +260,7 @@ async function selectOptions() {
if (custom) {
const customMcp = await text({
message: "Insert custom url or command",
initialValue: hasSavedCustomUrl ? savedMcpUrl : undefined,
});
if (isCancel(customMcp)) {
cancel("Operation cancelled.");
Expand All @@ -202,23 +269,29 @@ async function selectOptions() {

mcp = customMcp;
} else {
mcp =
mcpIntegration === "http"
? "https://mcp.svelte.dev/mcp"
: "npx -y @sveltejs/mcp";
mcp = defaultMcpUrl;
}
}

const testingTool = await confirm({
message: "Do you want to provide the testing tool to the model?",
initialValue: true,
initialValue: savedSettings?.testingTool ?? true,
});

if (isCancel(testingTool)) {
cancel("Operation cancelled.");
process.exit(0);
}

const newSettings: SavedSettings = {
models: selectedModels,
mcpIntegration: mcpIntegrationType,
mcpServerUrl: mcp,
testingTool,
pricingEnabled: pricing.enabled,
};
saveSettings(newSettings);

return {
models: selectedModels,
mcp,
Expand Down Expand Up @@ -301,7 +374,15 @@ async function runSingleTest(
if (testComponentEnabled) {
console.log(" 📋 TestComponent tool is available");
}
const result = await agent.generate({ prompt: fullPrompt });

const result = await withRetry(
async () => agent.generate({ prompt: fullPrompt }),
{
retries: 10,
minTimeout: 1000,
factor: 2,
},
);

const resultWriteContent = extractResultWriteContent(result.steps);

Expand All @@ -321,17 +402,32 @@ async function runSingleTest(
console.log(" ⏳ Verifying against tests...");
const verification = await runTestVerification(test, resultWriteContent);

if (verification.passed) {
if (verification.validation) {
if (verification.validation.valid) {
console.log(" ✓ Code validation passed");
} else {
console.log(" ✗ Code validation failed:");
for (const error of verification.validation.errors) {
console.log(` - ${error}`);
}
}
}

if (verification.validationFailed) {
console.log(
` ⊘ Validation failed (${verification.numPassed}/${verification.numTests} tests passed)`,
);
} else if (verification.passed) {
console.log(
`✓ All tests passed (${verification.numPassed}/${verification.numTests})`,
` ✓ All tests passed (${verification.numPassed}/${verification.numTests})`,
);
} else {
console.log(
`✗ Tests failed (${verification.numFailed}/${verification.numTests} failed)`,
` ✗ Tests failed (${verification.numFailed}/${verification.numTests} failed)`,
);
if (verification.failedTests) {
for (const ft of verification.failedTests) {
console.log(`- ${ft.fullName}`);
console.log(` - ${ft.fullName}`);
}
}
}
Expand All @@ -346,7 +442,7 @@ async function runSingleTest(
verification,
};
} catch (error) {
console.error(`✗ Error running test: ${error}`);
console.error(` ✗ Error running test: ${error}`);
return {
testName: test.name,
prompt: fullPrompt,
Expand Down Expand Up @@ -463,7 +559,7 @@ async function main() {

const model = gateway.languageModel(modelId);

const testResults = [];
const testResults: SingleTestResult[] = [];
const startTime = Date.now();

for (let i = 0; i < tests.length; i++) {
Expand Down Expand Up @@ -493,6 +589,8 @@ async function main() {
totalFailed += failed;
const skipped = testResults.filter((r) => !r.verification).length;

const unitTestTotals = calculateUnitTestTotals(testResults);

for (const result of testResults) {
const status = result.verification
? result.verification.passed
Expand All @@ -504,13 +602,28 @@ async function main() {
? "PASSED"
: "FAILED"
: "SKIPPED";
console.log(`${status} ${result.testName}: ${statusText}`);

const validationInfo = result.verification?.validation
? result.verification.validation.valid
? " (validated)"
: " (validation failed)"
: "";

const unitTestInfo = result.verification
? ` [${result.verification.numPassed}/${result.verification.numTests} unit tests]`
: "";

console.log(
`${status} ${result.testName}: ${statusText}${validationInfo}${unitTestInfo}`,
);
}

console.log("─".repeat(50));
console.log(
`Total: ${passed} passed, ${failed} failed, ${skipped} skipped (${(totalDuration / 1000).toFixed(1)}s)`,
`Test Suites: ✓ ${passed} passed${failed} failed ${skipped > 0 ? `⊘ ${skipped} skipped ` : ""}(${unitTestTotals.passed}/${unitTestTotals.total} unit tests)`,
);
console.log(`Score: ${unitTestTotals.score}%`);
console.log(`Duration: ${(totalDuration / 1000).toFixed(1)}s`);

let totalCost = null;
let pricingInfo = null;
Expand Down Expand Up @@ -561,6 +674,7 @@ async function main() {
pricingKey: pricingLookup?.matchedKey ?? null,
pricing: pricingInfo,
totalCost,
unitTestTotals,
},
};

Expand Down
Loading