diff --git a/genkit-tools/common/src/eval/evaluate.ts b/genkit-tools/common/src/eval/evaluate.ts
index 8bbc6607ba..d7b391f91d 100644
--- a/genkit-tools/common/src/eval/evaluate.ts
+++ b/genkit-tools/common/src/eval/evaluate.ts
@@ -36,7 +36,6 @@ import {
 import {
   evaluatorName,
   generateTestCaseId,
-  getAction,
   getEvalExtractors,
   getModelInput,
   hasAction,
@@ -67,6 +66,8 @@ interface FullInferenceSample {
 const SUPPORTED_ACTION_TYPES = ['flow', 'model', 'executable-prompt'] as const;
 type SupportedActionType = (typeof SUPPORTED_ACTION_TYPES)[number];
 
+const GENERATE_ACTION_UTIL = '/util/generate';
+
 /**
  * Starts a new evaluation run. Intended to be used via the reflection API.
  */
@@ -385,14 +386,16 @@ async function runPromptAction(params: {
   promptConfig?: any;
 }): Promise<InferenceRunState> {
   const { manager, actionRef, sample, context, promptConfig } = { ...params };
-
-  const { model: modelFromConfig, ...restOfConfig } = promptConfig ?? {};
-  const model = await resolveModel({ manager, actionRef, modelFromConfig });
-  if (!model) {
+  const { model: modelFromConfig, ...restOfConfig } = promptConfig;
+  if (!modelFromConfig) {
     throw new Error(
-      'Could not resolve model. Please specify model and try again'
+      'Missing model: Please specify model for prompt evaluation'
     );
   }
+  const model = modelFromConfig.split('/model/').pop();
+  if (!model) {
+    throw new Error(`Improper model provided: ${modelFromConfig}`);
+  }
   let state: InferenceRunState;
   let renderedPrompt: {
     result: GenerateActionOptions;
@@ -430,11 +433,14 @@ async function runPromptAction(params: {
   // Step 2. Run rendered prompt on the model
   try {
     let modelInput = renderedPrompt.result;
-    if (restOfConfig) {
-      modelInput = { ...modelInput, config: restOfConfig };
+    // Override with runtime-specific config
+    if (Object.keys(restOfConfig ?? {}).length > 0) {
+      modelInput = { ...modelInput, model, config: restOfConfig };
+    } else {
+      modelInput = { ...modelInput, model };
     }
     const runActionResponse = await manager.runAction({
-      key: model,
+      key: GENERATE_ACTION_UTIL,
       input: modelInput,
     });
     const traceIds = runActionResponse.telemetry?.traceId
@@ -541,23 +547,6 @@ async function gatherEvalInput(params: {
   };
 }
 
-async function resolveModel(params: {
-  manager: RuntimeManager;
-  actionRef: string;
-  modelFromConfig?: string;
-}) {
-  const { manager, actionRef, modelFromConfig } = { ...params };
-
-  // Prefer to use modelFromConfig
-  if (modelFromConfig) {
-    return modelFromConfig;
-  }
-
-  const actionData = await getAction({ manager, actionRef });
-  const promptMetadata = actionData?.metadata?.prompt as any;
-  return promptMetadata?.model ? `/model/${promptMetadata?.model}` : undefined;
-}
-
 function getSpanErrorMessage(span: SpanData): string | undefined {
   if (span && span.status?.code === 2 /* SpanStatusCode.ERROR */) {
     // It's possible for a trace to have multiple exception events,
diff --git a/js/testapps/evals/prompts/hello.prompt b/js/testapps/evals/prompts/hello.prompt
index 0ef26c9c6d..c51d31d169 100644
--- a/js/testapps/evals/prompts/hello.prompt
+++ b/js/testapps/evals/prompts/hello.prompt
@@ -1,7 +1,7 @@
 ---
-model: googleai/gemini-2.5-flash
 config:
   temperature: 0.75
+  topK: 10
 input:
   schema:
     query: string
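
Reviewer note (not part of the patch): prompt evals now require `model` in `promptConfig` and strip any leading `/model/` action prefix from it before routing the rendered prompt through the shared `/util/generate` action, which is why `hello.prompt` no longer hardcodes a model. A minimal standalone TypeScript sketch of that normalization; `normalizeModelKey` is a hypothetical helper name, not something the patch introduces:

// Sketch of the `modelFromConfig.split('/model/').pop()` behavior above.
// Accepts a model reference with or without the '/model/' action prefix
// and returns the bare model name, throwing on an empty result.
function normalizeModelKey(modelFromConfig: string): string {
  // '/model/googleai/gemini-2.5-flash' -> ['', 'googleai/gemini-2.5-flash']
  // 'googleai/gemini-2.5-flash'        -> ['googleai/gemini-2.5-flash']
  const model = modelFromConfig.split('/model/').pop();
  if (!model) {
    // e.g. modelFromConfig === '/model/' yields '' and is rejected
    throw new Error(`Improper model provided: ${modelFromConfig}`);
  }
  return model;
}

console.log(normalizeModelKey('/model/googleai/gemini-2.5-flash')); // 'googleai/gemini-2.5-flash'
console.log(normalizeModelKey('googleai/gemini-2.5-flash'));        // 'googleai/gemini-2.5-flash'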