8 changes: 4 additions & 4 deletions py/packages/genkit/src/genkit/core/reflection.py
@@ -185,7 +185,7 @@ def send_chunk(chunk):

async def run_fn():
return await action.arun_raw(
raw_input=payload['input'],
raw_input=payload.get('input'),
on_chunk=send_chunk,
context=context,
)
@@ -216,7 +216,7 @@ async def run_fn():
try:

async def run_fn():
return await action.arun_raw(raw_input=payload['input'], context=context)
return await action.arun_raw(raw_input=payload.get('input'), context=context)

output = run_async(loop, run_fn)

@@ -413,7 +413,7 @@ async def send_chunk(chunk):
yield f'{out}\n'

output = await action.arun_raw(
raw_input=payload['input'],
raw_input=payload.get('input'),
on_chunk=send_chunk,
context=context,
)
@@ -462,7 +462,7 @@ async def run_standard_action(
A JSONResponse with the action result or error.
"""
try:
output = await action.arun_raw(raw_input=payload['input'], context=context)
output = await action.arun_raw(raw_input=payload.get('input'), context=context)
response = {
'result': dump_dict(output.response),
'telemetry': {'traceId': output.trace_id},
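The recurring change in this file replaces `payload['input']` with `payload.get('input')`. A quick illustration of the difference in plain Python (the `payload` dict below is a stand-in, not an actual reflection request):

```python
payload = {}  # stand-in for a request body that omits the 'input' field

# payload['input'] would raise KeyError: 'input' here.
# .get() returns None instead, so the handler passes raw_input=None to
# arun_raw rather than failing before the action runs.
raw_input = payload.get('input')
print(raw_input)  # None
```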
(file header not captured; the following hunks modify the DevLocalVectorStore indexer)
@@ -20,6 +20,7 @@
from hashlib import md5

from genkit.blocks.document import Document
from genkit.blocks.retriever import IndexerRequest
from genkit.codec import dump_json
from genkit.types import DocumentData, Embedding

@@ -30,7 +31,8 @@


class DevLocalVectorStoreIndexer(LocalVectorStoreAPI):
async def index(self, docs: list[DocumentData]) -> None:
async def index(self, request: IndexerRequest) -> None:
docs = request.documents
data = self._load_filestore()
tasks = []

97 changes: 69 additions & 28 deletions py/plugins/evaluators/src/genkit/plugins/evaluators/plugin_api.py
@@ -16,6 +16,7 @@


import json
import os
import re
from collections.abc import Callable
from typing import Any
@@ -37,6 +38,12 @@
from genkit.types import BaseEvalDataPoint, EvalFnResponse, EvalStatusEnum, Score


def _get_prompt_path(filename: str) -> str:
"""Get absolute path to a prompt file in the prompts directory."""
plugin_dir = os.path.dirname(os.path.abspath(__file__))
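# Walk up four levels from src/genkit/plugins/evaluators/ to the plugin package root, then into its prompts/ directory.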
return os.path.join(plugin_dir, '..', '..', '..', '..', 'prompts', filename)


def evaluators_name(name: str) -> str:
"""Create an evaluators plugin name.

@@ -67,8 +74,10 @@ class GenkitEvaluators(Plugin):

name = 'genkitEval'

def __init__(self, params: PluginOptions):
def __init__(self, params: PluginOptions | list[MetricConfig]):
"""Initialize Genkit Evaluators plugin."""
if isinstance(params, list):
params = PluginOptions(root=params)
self.params = params

def initialize(self, ai: Genkit) -> None:
@@ -84,20 +93,18 @@ def _configure_evaluator(self, ai: Genkit, param: MetricConfig):

async def _relevancy_eval(datapoint: BaseEvalDataPoint, options: Any | None):
assert datapoint.output is not None, 'output is required'
assert datapoint.reference is not None, 'reference is required'
assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
output_string = (
datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
)
input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
prompt_function = await load_prompt_file('../../prompts/faithfulness_long_form.prompt')
prompt_function = await load_prompt_file(_get_prompt_path('faithfulness_long_form.prompt'))
context = ' '.join(json.dumps(e) for e in datapoint.context)
prompt = await render_text(
prompt_function, {'input': input_string, 'output': output_string, 'context': context}
)

response = await ai.generate(
model=param.judge,
model=param.judge.name,
prompt=prompt,
config=param.config,
output_schema=AnswerRelevancyResponseSchema,
@@ -107,54 +114,90 @@ async def _relevancy_eval(datapoint: BaseEvalDataPoint, options: Any | None):
return fill_scores(datapoint, Score(score=score, status=status), param.status_override_fn)

ai.define_evaluator(
name=evaluators_name(str(GenkitMetricType.MALICIOUSNESS).lower()),
name=evaluators_name(str(GenkitMetricType.ANSWER_RELEVANCY).lower()),
display_name='Answer Relevancy',
definition='Assesses how pertinent the generated answer is to the given prompt',
fn=_relevancy_eval,
)
case GenkitMetricType.FAITHFULNESS:
# Cache for prompts (loaded on first use)
_faithfulness_prompts = {}

async def _faithfulness_eval(datapoint: BaseEvalDataPoint, options: Any | None):
assert datapoint.output is not None, 'output is required'
assert datapoint.reference is not None, 'reference is required'
assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
output_string = (
datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
)
input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
prompt_function = await load_prompt_file('../../prompts/faithfulness_long_form.prompt')
context = ' '.join(json.dumps(e) for e in datapoint.context)
context_list = [(json.dumps(e) if not isinstance(e, str) else e) for e in (datapoint.context or [])]

# Lazy load and cache prompts
if 'longform' not in _faithfulness_prompts:
_faithfulness_prompts['longform'] = await load_prompt_file(
_get_prompt_path('faithfulness_long_form.prompt')
)
if 'nli' not in _faithfulness_prompts:
_faithfulness_prompts['nli'] = await load_prompt_file(
_get_prompt_path('faithfulness_nli.prompt')
)

# Step 1: Extract statements
prompt = await render_text(
prompt_function, {'input': input_string, 'output': output_string, 'context': context}
_faithfulness_prompts['longform'], {'question': input_string, 'answer': output_string}
)

longform_response = await ai.generate(
model=param.judge_llm,
model=param.judge.name,
prompt=prompt,
config=param.config,
config=param.judge_config,
output_schema=LongFormResponseSchema,
)
statements = (
longform_response.output.get('statements', [])
if isinstance(longform_response.output, dict)
else (longform_response.output.statements if longform_response.output else [])
)
if not statements:
raise ValueError('No statements returned')

prompt_function = await load_prompt_file('../../prompts/faithfulness_nli.prompt')
context = ' '.join(json.dumps(e) for e in datapoint.context)
# Step 2: NLI Check
all_statements = '\n'.join([f'statement: {s}' for s in statements])
all_context = '\n'.join(context_list)
prompt = await render_text(
prompt_function, {'input': input_string, 'output': output_string, 'context': context}
_faithfulness_prompts['nli'], {'context': all_context, 'statements': all_statements}
)

longform_response = await ai.generate(
model=param.judge_llm,
nli_response = await ai.generate(
model=param.judge.name,
prompt=prompt,
config=param.config,
output_schema=LongFormResponseSchema,
config=param.judge_config,
output_schema=NliResponse,
)

nli_output = nli_response.output
if isinstance(nli_output, dict):
responses = nli_output.get('responses', [])
else:
responses = nli_output.responses if nli_output else []

if not responses:
raise ValueError('Evaluator response empty')

# Handle both dict and object responses
faithful_count = sum(
1 for r in responses if (r.get('verdict') if isinstance(r, dict) else r.verdict)
)
score_val = faithful_count / len(responses)
reasoning = '; '.join([r.get('reason', '') if isinstance(r, dict) else r.reason for r in responses])
status = EvalStatusEnum.PASS_ if score_val > 0.5 else EvalStatusEnum.FAIL

status = EvalStatusEnum.PASS_ if longform_response else EvalStatusEnum.FAIL
return fill_scores(
datapoint, Score(score=longform_response, status=status), param.status_override_fn
datapoint,
Score(score=score_val, status=status, details={'reasoning': reasoning}),
param.status_override_fn,
)

ai.define_evaluator(
name=evaluators_name(str(GenkitMetricType.MALICIOUSNESS).lower()),
name=evaluators_name(str(GenkitMetricType.FAITHFULNESS).lower()),
display_name='Faithfulness',
definition='Measures the factual consistency of the generated answer against the given context',
fn=_faithfulness_eval,
@@ -164,20 +207,18 @@ async def _faithfulness_eval(datapoint: BaseEvalDataPoint, options: Any | None):

async def _maliciousness_eval(datapoint: BaseEvalDataPoint, options: Any | None):
assert datapoint.output is not None, 'output is required'
assert datapoint.reference is not None, 'reference is required'
assert isinstance(datapoint.reference, str), 'reference must be of string (regex)'
output_string = (
datapoint.output if isinstance(datapoint.output, str) else json.dumps(datapoint.output)
)
input_string = datapoint.input if isinstance(datapoint.input, str) else json.dumps(datapoint.input)
prompt_function = await load_prompt_file('../../prompts/maliciousness.prompt')
prompt_function = await load_prompt_file(_get_prompt_path('maliciousness.prompt'))
context = ' '.join(json.dumps(e) for e in datapoint.context)
prompt = await render_text(
prompt_function, {'input': input_string, 'output': output_string, 'context': context}
)

score = await ai.generate(
model=param.judge_llm,
model=param.judge.name,
prompt=prompt,
config=param.config,
output_schema=MaliciousnessResponseSchema,
78 changes: 61 additions & 17 deletions py/samples/evaluator-demo/README.md
@@ -1,38 +1,82 @@
# Evaluator Demo
# Evaluation in Genkit

An example demonstrating running flows using the Google GenAI plugin.
This sample demonstrates the evaluation features of the Genkit Python SDK.

## Setup environment
Note: This sample focuses on Genkit's evaluation features, using the official Genkit Evaluators plugin. If you are interested in writing your own custom evaluator, see `custom/test_evaluator` defined in `src/index.py`.
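
For a rough idea of what such a custom evaluator looks like, here is a minimal sketch modeled on the `define_evaluator` usage in the genkitEval plugin code from this PR. The real `custom/test_evaluator` lives in `src/index.py`; the random scoring and the bare `Score` return value below are illustrative assumptions.

```python
import random
from typing import Any

from genkit.ai import Genkit
from genkit.types import BaseEvalDataPoint, EvalStatusEnum, Score

ai = Genkit()


async def test_evaluator_fn(datapoint: BaseEvalDataPoint, options: Any | None):
    """Assign a random score; fast and requires no LLM calls."""
    score = random.random()
    status = EvalStatusEnum.PASS_ if score > 0.5 else EvalStatusEnum.FAIL
    # Assumption: returning a Score directly; the genkitEval plugin wraps its
    # scores with fill_scores() before returning.
    return Score(score=score, status=status)


ai.define_evaluator(
    name='custom/test_evaluator',
    display_name='Test Evaluator',
    definition='Random scores for quickly exercising the evaluation tooling',
    fn=test_evaluator_fn,
)
```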

Obtain an API key from [ai.dev](https://ai.dev).

Export the API key as the env variable `GEMINI_API_KEY` in your shell configuration.
## Setup and start the sample

```bash
export GEMINI_API_KEY='<Your api key>'

# Start the Genkit Dev UI
genkit start -- uv run samples/evaluator-demo/src/index.py
# This command should output the link to the Genkit Dev UI.
```

## Run the sample
The rest of the commands in this guide can be run in a separate terminal or directly in the Dev UI.

Start the Genkit Developer UI:
### Initial Setup

```bash
genkit start -- uv run src/eval_demo.py
# Index "docs/cat-handbook.pdf" to start
# testing Genkit evaluation features. Please see
# src/setup.py for more details.

genkit flow:run setup
```

## Evaluations

### Simple inference and evaluation
### Running Evaluations via CLI

Use the `eval:flow` command to run a flow against a dataset and evaluate the outputs:

```bash
# Evaluate with a specific evaluator
genkit eval:flow pdf_qa --input data/cat_adoption_questions.json --evaluator=custom/test_evaluator

# Evaluate with multiple evaluators
genkit eval:flow pdf_qa --input data/cat_adoption_questions.json --evaluator=genkitEval/faithfulness --evaluator=genkitEval/maliciousness

Use the `run_eval_demo` command to run a flow against a set of input samples and
evaluate the generated outputs. Input (JSON) = "{}"
# Evaluate with all available evaluators (omit --evaluator flag)
genkit eval:flow pdf_qa --input data/cat_adoption_questions.json
```

### Running Evaluations in Dev UI

1. Navigate to the **Evaluations** tab in the Dev UI
2. Click **"Run Evaluation"** or **"New Evaluation"**
3. Configure:
- **Flow**: Select the flow to evaluate (e.g., `pdf_qa`)
- **Dataset**: Upload or select a JSON file (e.g., `data/cat_adoption_questions.json`)
- **Evaluators**: Select one or more evaluators:
- `custom/test_evaluator` - Random evaluator for testing (fast, no LLM calls)
- `genkitEval/faithfulness` - Checks if output is faithful to context
- `genkitEval/maliciousness` - Detects harmful content
- `genkitEval/answer_relevancy` - Checks if answer is relevant to question
4. Click **"Run"**
5. View results in the Evaluations tab

## Run tests
### Programmatic Evaluation

To run the automated tests for this sample:
The `dog_facts_eval` flow demonstrates running evaluations from code. See `src/eval_in_code.py` for implementation details.

```bash
uv run pytest -v src/eval_demo.py
# Run programmatic evaluation
genkit flow:run dog_facts_eval
```

**Note:** The `dog_facts_eval` flow evaluates 20 test cases with the faithfulness metric, making 40 LLM API calls. This takes approximately 5 minutes to complete.

## Available Flows

- **setup**: Indexes the default PDF document (`docs/cat-handbook.pdf`) into the vector store
- **index_pdf**: Indexes a specified PDF file (defaults to `docs/cat-wiki.pdf`)
- **pdf_qa**: RAG flow that answers questions based on indexed PDF documents. It requires the `setup` flow to be run first.
- **simple_structured**: Simple flow with structured input/output
- **simple_echo**: Simple echo flow
- **dog_facts_eval**: Programmatic evaluation flow using the faithfulness metric on a dog facts dataset

## Reference

For more details on using Genkit evaluations, please refer to the official [Genkit documentation](https://firebase.google.com/docs/genkit/evaluation).
7 changes: 7 additions & 0 deletions py/samples/evaluator-demo/data/capra-test.json

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions py/samples/evaluator-demo/data/cat_adoption_questions.jsonl
@@ -0,0 +1,4 @@
{"input":"What are typical cat behaviors?"}
{"input":"What supplies do you need when bringing home a new cat?"}
{"input":"How often should you trim your cat's nails?"}
{"input":"What are some plants that are toxic to cats?"}
(file header not captured; the following file is a JSON evaluation dataset with reference answers)
@@ -0,0 +1,18 @@
[
{
"input": "What are typical cat behaviors?",
"reference": "Cats like to purr, push things away and cuddle."
},
{
"input": "What supplies do you need when bringing home a new cat?",
"reference": "Litter box, cat food and plenty of yarn"
},
{
"input": "How often should you trim your cat's nails?",
"reference": "Trim your cat's nails only when you feel like they're overgrown"
},
{
"input": "What are some plants that are toxic to cats?",
"reference": "I don't know, maybe poison ivy?"
}
]