8 changes: 4 additions & 4 deletions py/packages/genkit/src/genkit/core/reflection.py
@@ -185,7 +185,7 @@ def send_chunk(chunk):

async def run_fn():
return await action.arun_raw(
raw_input=payload['input'],
raw_input=payload.get('input'),
on_chunk=send_chunk,
context=context,
)
@@ -216,7 +216,7 @@ async def run_fn():
try:

async def run_fn():
return await action.arun_raw(raw_input=payload['input'], context=context)
return await action.arun_raw(raw_input=payload.get('input'), context=context)

output = run_async(loop, run_fn)

@@ -413,7 +413,7 @@ async def send_chunk(chunk):
yield f'{out}\n'

output = await action.arun_raw(
raw_input=payload['input'],
raw_input=payload.get('input'),
on_chunk=send_chunk,
context=context,
)
@@ -462,7 +462,7 @@ async def run_standard_action(
A JSONResponse with the action result or error.
"""
try:
output = await action.arun_raw(raw_input=payload['input'], context=context)
output = await action.arun_raw(raw_input=payload.get('input'), context=context)
response = {
'result': dump_dict(output.response),
'telemetry': {'traceId': output.trace_id},
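The recurring change in this file replaces `payload['input']` with `payload.get('input')`. A quick illustration of the difference in plain Python (the `payload` dict below is a stand-in, not an actual reflection request):

```python
payload = {}  # stand-in for a request body that omits the 'input' field

# payload['input'] would raise KeyError: 'input' here.
# .get() returns None instead, so the handler passes raw_input=None to
# arun_raw rather than failing before the action runs.
raw_input = payload.get('input')
print(raw_input)  # None
```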
(file header not captured; the following hunks modify the DevLocalVectorStore indexer)
@@ -20,6 +20,7 @@
from hashlib import md5

from genkit.blocks.document import Document
from genkit.blocks.retriever import IndexerRequest
from genkit.codec import dump_json
from genkit.types import DocumentData, Embedding

@@ -30,7 +31,8 @@


class DevLocalVectorStoreIndexer(LocalVectorStoreAPI):
async def index(self, docs: list[DocumentData]) -> None:
async def index(self, request: IndexerRequest) -> None:
docs = request.documents
data = self._load_filestore()
tasks = []

97 changes: 69 additions & 28 deletions py/plugins/evaluators/src/genkit/plugins/evaluators/plugin_api.py
@@ -16,6 +16,7 @@


import json
import os
import re
from collections.abc import Callable
from typing import Any
@@ -37,6 +38,12 @@
from genkit.types import BaseEvalDataPoint, EvalFnResponse, EvalStatusEnum, Score


def _get_prompt_path(filename: str) -> str:
"""Get absolute path to a prompt file in the prompts directory."""
plugin_dir = os.path.dirname(os.path.abspath(__file__))
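# Walk up four levels from src/genkit/plugins/evaluators/ to the plugin package root, then into its prompts/ directory.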
return os.path.join(plugin_dir, '..', '..', '..', '..', 'prompts', filename)


def evaluators_name(name: str) -> str:
"""Create an evaluators plugin name.

@@ -67,8 +74,10 @@ class GenkitEvaluators(Plugin):

name = 'genkitEval'

def __init__(self, params: PluginOptions):
def __init__(self, params: PluginOptions | list[MetricConfig]):
"""Initialize Genkit Evaluators plugin."""
if isinstance(params, list):
params = PluginOptions(root=params)
self.params = params

def initialize(self, ai: Genkit) -> None:
@@ -84,20 +93,18 @@ def _configure_evaluator(self, ai: Genkit, param: MetricConfig):

async def _relevancy_eval(datapoint: BaseEvalDataPoint, options: Any | None):
assert datapoint.output is not None, 'output is required'
assert datapoint.reference is not None, 'reference is required'
assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
output_string = (
datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
)
input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
prompt_function = await load_prompt_file('../../prompts/faithfulness_long_form.prompt')
prompt_function = await load_prompt_file(_get_prompt_path('faithfulness_long_form.prompt'))
context = ' '.join(json.dumps(e) for e in datapoint.context)
prompt = await render_text(
prompt_function, {'input': input_string, 'output': output_string, 'context': context}
)

response = await ai.generate(
model=param.judge,
model=param.judge.name,
prompt=prompt,
config=param.config,
output_schema=AnswerRelevancyResponseSchema,
@@ -107,54 +114,90 @@ async def _relevancy_eval(datapoint: BaseEvalDataPoint, options: Any | None):
return fill_scores(datapoint, Score(score=score, status=status), param.status_override_fn)

ai.define_evaluator(
name=evaluators_name(str(GenkitMetricType.MALICIOUSNESS).lower()),
name=evaluators_name(str(GenkitMetricType.ANSWER_RELEVANCY).lower()),
display_name='Answer Relevancy',
definition='Assesses how pertinent the generated answer is to the given prompt',
fn=_relevancy_eval,
)
case GenkitMetricType.FAITHFULNESS:
# Cache for prompts (loaded on first use)
_faithfulness_prompts = {}

async def _faithfulness_eval(datapoint: BaseEvalDataPoint, options: Any | None):
assert datapoint.output is not None, 'output is required'
assert datapoint.reference is not None, 'reference is required'
assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
output_string = (
datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
)
input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
prompt_function = await load_prompt_file('../../prompts/faithfulness_long_form.prompt')
context = ' '.join(json.dumps(e) for e in datapoint.context)
context_list = [(json.dumps(e) if not isinstance(e, str) else e) for e in (datapoint.context or [])]

# Lazy load and cache prompts
if 'longform' not in _faithfulness_prompts:
_faithfulness_prompts['longform'] = await load_prompt_file(
_get_prompt_path('faithfulness_long_form.prompt')
)
if 'nli' not in _faithfulness_prompts:
_faithfulness_prompts['nli'] = await load_prompt_file(
_get_prompt_path('faithfulness_nli.prompt')
)

# Step 1: Extract statements
prompt = await render_text(
prompt_function, {'input': input_string, 'output': output_string, 'context': context}
_faithfulness_prompts['longform'], {'question': input_string, 'answer': output_string}
)

longform_response = await ai.generate(
model=param.judge_llm,
model=param.judge.name,
prompt=prompt,
config=param.config,
config=param.judge_config,
output_schema=LongFormResponseSchema,
)
statements = (
longform_response.output.get('statements', [])
if isinstance(longform_response.output, dict)
else (longform_response.output.statements if longform_response.output else [])
)
if not statements:
raise ValueError('No statements returned')

prompt_function = await load_prompt_file('../../prompts/faithfulness_nli.prompt')
context = ' '.join(json.dumps(e) for e in datapoint.context)
# Step 2: NLI Check
all_statements = '\n'.join([f'statement: {s}' for s in statements])
all_context = '\n'.join(context_list)
prompt = await render_text(
prompt_function, {'input': input_string, 'output': output_string, 'context': context}
_faithfulness_prompts['nli'], {'context': all_context, 'statements': all_statements}
)

longform_response = await ai.generate(
model=param.judge_llm,
nli_response = await ai.generate(
model=param.judge.name,
prompt=prompt,
config=param.config,
output_schema=LongFormResponseSchema,
config=param.judge_config,
output_schema=NliResponse,
)

nli_output = nli_response.output
if isinstance(nli_output, dict):
responses = nli_output.get('responses', [])
else:
responses = nli_output.responses if nli_output else []

if not responses:
raise ValueError('Evaluator response empty')

# Handle both dict and object responses
faithful_count = sum(
1 for r in responses if (r.get('verdict') if isinstance(r, dict) else r.verdict)
)
score_val = faithful_count / len(responses)
reasoning = '; '.join([r.get('reason', '') if isinstance(r, dict) else r.reason for r in responses])
status = EvalStatusEnum.PASS_ if score_val > 0.5 else EvalStatusEnum.FAIL

status = EvalStatusEnum.PASS_ if longform_response else EvalStatusEnum.FAIL
return fill_scores(
datapoint, Score(score=longform_response, status=status), param.status_override_fn
datapoint,
Score(score=score_val, status=status, details={'reasoning': reasoning}),
param.status_override_fn,
)

ai.define_evaluator(
name=evaluators_name(str(GenkitMetricType.MALICIOUSNESS).lower()),
name=evaluators_name(str(GenkitMetricType.FAITHFULNESS).lower()),
display_name='Faithfulness',
definition='Measures the factual consistency of the generated answer against the given context',
fn=_faithfulness_eval,
@@ -164,20 +207,18 @@ async def _faithfulness_eval(datapoint: BaseEvalDataPoint, options: Any | None):

async def _maliciousness_eval(datapoint: BaseEvalDataPoint, options: Any | None):
assert datapoint.output is not None, 'output is required'
assert datapoint.reference is not None, 'reference is required'
assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
output_string = (
datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
)
input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
prompt_function = await load_prompt_file('../../prompts/maliciousness.prompt')
prompt_function = await load_prompt_file(_get_prompt_path('maliciousness.prompt'))
context = ' '.join(json.dumps(e) for e in datapoint.context)
prompt = await render_text(
prompt_function, {'input': input_string, 'output': output_string, 'context': context}
)

score = await ai.generate(
model=param.judge_llm,
model=param.judge.name,
prompt=prompt,
config=param.config,
output_schema=MaliciousnessResponseSchema,
78 changes: 61 additions & 17 deletions py/samples/evaluator-demo/README.md
@@ -1,38 +1,82 @@
# Evaluator Demo
# Evaluation in Genkit

An example demonstrating running flows using the Google GenAI plugin.
This sample demonstrates the evaluation features of the Genkit Python SDK.

## Setup environment
Note: This sample focuses on Genkit's evaluation features, using the official Genkit Evaluators plugin. If you are interested in writing your own custom evaluator, see `custom/test_evaluator` defined in `src/index.py`.
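
For a rough idea of what such a custom evaluator looks like, here is a minimal sketch modeled on the `define_evaluator` usage in the genkitEval plugin code from this PR. The real `custom/test_evaluator` lives in `src/index.py`; the random scoring and the bare `Score` return value below are illustrative assumptions.

```python
import random
from typing import Any

from genkit.ai import Genkit
from genkit.types import BaseEvalDataPoint, EvalStatusEnum, Score

ai = Genkit()


async def test_evaluator_fn(datapoint: BaseEvalDataPoint, options: Any | None):
    """Assign a random score; fast and requires no LLM calls."""
    score = random.random()
    status = EvalStatusEnum.PASS_ if score > 0.5 else EvalStatusEnum.FAIL
    # Assumption: returning a Score directly; the genkitEval plugin wraps its
    # scores with fill_scores() before returning.
    return Score(score=score, status=status)


ai.define_evaluator(
    name='custom/test_evaluator',
    display_name='Test Evaluator',
    definition='Random scores for quickly exercising the evaluation tooling',
    fn=test_evaluator_fn,
)
```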

Obtain an API key from [ai.dev](https://ai.dev).

Export the API key as the env variable `GEMINI_API_KEY` in your shell configuration.
## Setup and start the sample

```bash
export GEMINI_API_KEY='<Your api key>'

# Start the Genkit Dev UI
genkit start -- uv run samples/evaluator-demo/src/index.py
# This command should output the link to the Genkit Dev UI.
```

## Run the sample
The rest of the commands in this guide can be run in a separate terminal or directly in the Dev UI.

Start the Genkit Developer UI:
### Initial Setup

```bash
genkit start -- uv run src/eval_demo.py
# Index "docs/cat-handbook.pdf" to start
# testing Genkit evaluation features. Please see
# src/setup.py for more details.

genkit flow:run setup
```

## Evaluations

### Simple inference and evaluation
### Running Evaluations via CLI

Use the `eval:flow` command to run a flow against a dataset and evaluate the outputs:

```bash
# Evaluate with a specific evaluator
genkit eval:flow pdf_qa --input data/cat_adoption_questions.json --evaluator=custom/test_evaluator

# Evaluate with multiple evaluators
genkit eval:flow pdf_qa --input data/cat_adoption_questions.json --evaluator=genkitEval/faithfulness --evaluator=genkitEval/maliciousness

Use the `run_eval_demo` command to run a flow against a set of input samples and
evaluate the generated outputs. Input (JSON) = "{}"
# Evaluate with all available evaluators (omit --evaluator flag)
genkit eval:flow pdf_qa --input data/cat_adoption_questions.json
```

### Running Evaluations in Dev UI

1. Navigate to the **Evaluations** tab in the Dev UI
2. Click **"Run Evaluation"** or **"New Evaluation"**
3. Configure:
- **Flow**: Select the flow to evaluate (e.g., `pdf_qa`)
- **Dataset**: Upload or select a JSON file (e.g., `data/cat_adoption_questions.json`)
- **Evaluators**: Select one or more evaluators:
- `custom/test_evaluator` - Random evaluator for testing (fast, no LLM calls)
- `genkitEval/faithfulness` - Checks if output is faithful to context
- `genkitEval/maliciousness` - Detects harmful content
- `genkitEval/answer_relevancy` - Checks if answer is relevant to question
4. Click **"Run"**
5. View results in the Evaluations tab

## Run tests
### Programmatic Evaluation

To run the automated tests for this sample:
The `dog_facts_eval` flow demonstrates running evaluations from code. See `src/eval_in_code.py` for implementation details.

```bash
uv run pytest -v src/eval_demo.py
# Run programmatic evaluation
genkit flow:run dog_facts_eval
```

**Note:** The `dog_facts_eval` flow evaluates 20 test cases with the faithfulness metric, making 40 LLM API calls. This takes approximately 5 minutes to complete.

## Available Flows

- **setup**: Indexes the default PDF document (`docs/cat-handbook.pdf`) into the vector store
- **index_pdf**: Indexes a specified PDF file (defaults to `docs/cat-wiki.pdf`)
- **pdf_qa**: RAG flow that answers questions based on indexed PDF documents. It requires the `setup` flow to be run first.
- **simple_structured**: Simple flow with structured input/output
- **simple_echo**: Simple echo flow
- **dog_facts_eval**: Programmatic evaluation flow using the faithfulness metric on a dog facts dataset

## Reference

For more details on using Genkit evaluations, please refer to the official [Genkit documentation](https://firebase.google.com/docs/genkit/evaluation).
7 changes: 7 additions & 0 deletions py/samples/evaluator-demo/data/capra-test.json

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions py/samples/evaluator-demo/data/cat_adoption_questions.jsonl
@@ -0,0 +1,4 @@
{"input":"What are typical cat behaviors?"}
{"input":"What supplies do you need when bringing home a new cat?"}
{"input":"How often should you trim your cat's nails?"}
{"input":"What are some plants that are toxic to cats?"}
(file header not captured; the following file is a JSON evaluation dataset with reference answers)
@@ -0,0 +1,18 @@
[
{
"input": "What are typical cat behaviors?",
"reference": "Cats like to purr, push things away and cuddle."
},
{
"input": "What supplies do you need when bringing home a new cat?",
"reference": "Litter box, cat food and plenty of yarn"
},
{
"input": "How often should you trim your cat's nails?",
"reference": "Trim your cat's nails only when you feel like they're overgrown"
},
{
"input": "What are some plants that are toxic to cats?",
"reference": "I don't know, maybe poison ivy?"
}
]