From a73ddc38368ee32893b5d792999721d8ace3d148 Mon Sep 17 00:00:00 2001 From: Samuel Bushi Date: Tue, 21 Oct 2025 11:01:11 -0400 Subject: [PATCH 1/2] WIP, improved eval format --- genkit-tools/common/src/eval/evaluate.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts index 8bbc6607ba..d3f2ca3dce 100644 --- a/genkit-tools/common/src/eval/evaluate.ts +++ b/genkit-tools/common/src/eval/evaluate.ts @@ -67,6 +67,8 @@ interface FullInferenceSample { const SUPPORTED_ACTION_TYPES = ['flow', 'model', 'executable-prompt'] as const; type SupportedActionType = (typeof SUPPORTED_ACTION_TYPES)[number]; +const GENERATE_ACTION_UTIL = '/util/generate'; + /** * Starts a new evaluation run. Intended to be used via the reflection API. */ @@ -430,11 +432,9 @@ async function runPromptAction(params: { // Step 2. Run rendered prompt on the model try { let modelInput = renderedPrompt.result; - if (restOfConfig) { - modelInput = { ...modelInput, config: restOfConfig }; - } + modelInput = { ...modelInput, model, config: restOfConfig ?? {} }; const runActionResponse = await manager.runAction({ - key: model, + key: GENERATE_ACTION_UTIL, input: modelInput, }); const traceIds = runActionResponse.telemetry?.traceId From 426a06ea99e37ac6fbe81c8555d95c0408566dcc Mon Sep 17 00:00:00 2001 From: Samuel Bushi Date: Tue, 28 Oct 2025 15:25:07 -0400 Subject: [PATCH 2/2] fix: Correctly handle custom config --- genkit-tools/common/src/eval/evaluate.ts | 37 +++++++++--------------- js/testapps/evals/prompts/hello.prompt | 2 +- 2 files changed, 14 insertions(+), 25 deletions(-) diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts index d3f2ca3dce..d7b391f91d 100644 --- a/genkit-tools/common/src/eval/evaluate.ts +++ b/genkit-tools/common/src/eval/evaluate.ts @@ -36,7 +36,6 @@ import { import { evaluatorName, generateTestCaseId, - getAction, getEvalExtractors, getModelInput, hasAction, @@ -387,14 +386,16 @@ async function runPromptAction(params: { promptConfig?: any; }): Promise { const { manager, actionRef, sample, context, promptConfig } = { ...params }; - - const { model: modelFromConfig, ...restOfConfig } = promptConfig ?? {}; - const model = await resolveModel({ manager, actionRef, modelFromConfig }); - if (!model) { + const { model: modelFromConfig, ...restOfConfig } = promptConfig; + if (!modelFromConfig) { throw new Error( - 'Could not resolve model. Please specify model and try again' + 'Missing model: Please specific model for prompt evaluation' ); } + const model = modelFromConfig.split('/model/').pop(); + if (!model) { + throw new Error(`Improper model provided: ${modelFromConfig}`); + } let state: InferenceRunState; let renderedPrompt: { result: GenerateActionOptions; @@ -432,7 +433,12 @@ async function runPromptAction(params: { // Step 2. Run rendered prompt on the model try { let modelInput = renderedPrompt.result; - modelInput = { ...modelInput, model, config: restOfConfig ?? {} }; + // Override with runtime specific config + if (Object.keys(restOfConfig ?? {}).length > 0) { + modelInput = { ...modelInput, model, config: restOfConfig }; + } else { + modelInput = { ...modelInput, model }; + } const runActionResponse = await manager.runAction({ key: GENERATE_ACTION_UTIL, input: modelInput, @@ -541,23 +547,6 @@ async function gatherEvalInput(params: { }; } -async function resolveModel(params: { - manager: RuntimeManager; - actionRef: string; - modelFromConfig?: string; -}) { - const { manager, actionRef, modelFromConfig } = { ...params }; - - // Prefer to use modelFromConfig - if (modelFromConfig) { - return modelFromConfig; - } - - const actionData = await getAction({ manager, actionRef }); - const promptMetadata = actionData?.metadata?.prompt as any; - return promptMetadata?.model ? `/model/${promptMetadata?.model}` : undefined; -} - function getSpanErrorMessage(span: SpanData): string | undefined { if (span && span.status?.code === 2 /* SpanStatusCode.ERROR */) { // It's possible for a trace to have multiple exception events, diff --git a/js/testapps/evals/prompts/hello.prompt b/js/testapps/evals/prompts/hello.prompt index 0ef26c9c6d..c51d31d169 100644 --- a/js/testapps/evals/prompts/hello.prompt +++ b/js/testapps/evals/prompts/hello.prompt @@ -1,7 +1,7 @@ --- -model: googleai/gemini-2.5-flash config: temperature: 0.75 + topK: 10 input: schema: query: string