Skip to content

Commit 85a0abe

Browse files
authored
test(tools): Add more golden tests for tools (#114)
1. Add golden tests for tools for the upcoming PR #12 2. chore: reformatting
1 parent c17573f commit 85a0abe

4 files changed

Lines changed: 102 additions & 66 deletions

File tree

documentation/examples/quick_start_examples.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def describe_image(llm, image_url: str, question: str, answer: str):
104104
answer="Cavalier King Charles Spaniel",
105105
)
106106

107+
107108
# %%
108109
@kbench.task(name="describe_video")
109110
def describe_video(llm, video_url: str, question: str, answer: str):

golden_tests/conftest.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ def module_report_fixture(request):
6262
test_result.outcome
6363
)
6464

65-
6665
base_name = Path(request.module.__file__).stem
6766
report_filename = f"{base_name}_report.yaml"
6867
report_path = Path(__file__).parent / report_filename

golden_tests/test_cookbook_examples.py

Lines changed: 97 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
"""
3232

3333
# %%
34-
import json
3534
import os
3635
import tempfile
3736
from contextlib import contextmanager
@@ -44,7 +43,6 @@
4443
from pydantic import BaseModel, Field
4544

4645
import kaggle_benchmarks as kbench
47-
from kaggle_benchmarks import messages
4846
from kaggle_benchmarks.content_types import images, videos
4947

5048
# Models to be tested as the primary subject.
@@ -516,6 +514,7 @@ def test_video_url(llm):
516514

517515

518516
def run_simple_calculator(a: float, b: float, operator: str) -> float:
517+
"""Supported operators are: + - * and /"""
519518
if operator == "+":
520519
return a + b
521520
if operator == "-":
@@ -527,74 +526,108 @@ def run_simple_calculator(a: float, b: float, operator: str) -> float:
527526
raise ValueError(f"Unknown operator: {operator}")
528527

529528

530-
@benchmark_test(
531-
exclude={
532-
"anthropic/claude-haiku-4-5@20251001",
533-
"anthropic/claude-opus-4-5@20251101",
534-
"anthropic/claude-sonnet-4-5@20250929",
535-
"deepseek-ai/deepseek-r1-0528",
536-
"deepseek-ai/deepseek-v3.2",
537-
"google/gemma-3-12b",
538-
"google/gemini-3.1-flash-lite-preview",
539-
}
540-
)
529+
@benchmark_test()
541530
@kbench.task()
542-
def test_manual_tool_use(llm):
531+
def test_simple_tool_use(llm):
543532
problem = "What is 50 plus 25?"
544533
expected_answer = 75.0
545534

546-
# Define the tool schema (JSON Schema format).
547-
calculator_tool = {
548-
"type": "function",
549-
"function": {
550-
"name": "simple_calculator",
551-
"description": "Calculates the result of an arithmetic operation.",
552-
"parameters": {
553-
"type": "object",
554-
"properties": {
555-
"a": {"type": "number", "description": "The first number."},
556-
"b": {"type": "number", "description": "The second number."},
557-
"operator": {
558-
"type": "string",
559-
"description": "The operator (+, -, *, /).",
560-
},
561-
},
562-
"required": ["a", "b", "operator"],
563-
},
564-
},
565-
}
566-
567-
# 1. Get the initial response from the LLM.
568-
kbench.actors.user.send(problem)
569-
tool_call_msg = llm.respond(tools=[calculator_tool])
570-
tool_calls = tool_call_msg.tool_calls
571-
572-
kbench.assertions.assert_true(bool(tool_calls), "LLM was expected to call a tool.")
573-
574-
# 2. Parse the tool call arguments.
575-
tool_call = tool_calls[0]
576-
function_args = json.loads(tool_call["function"]["arguments"])
577-
# Removes 'signature' parameter in thinking mode.
578-
function_args.pop("signature", None)
579-
580-
# 3. Execute the actual Python function.
581-
tool_result = run_simple_calculator(**function_args)
582-
583-
# 4. Send the tool result back to the LLM.
584-
tool_actor = kbench.actors.Actor(name="Tool", role="tool", avatar="🛠️")
585-
tool_actor.send(
586-
messages.Message(
587-
sender=tool_actor,
588-
content=str(tool_result),
589-
_meta={"tool_call_id": tool_call["id"]},
590-
)
591-
)
592-
593-
# 5. Get the final answer.
594-
final_answer_msg = llm.respond()
595-
final_answer = final_answer_msg.content
535+
final_answer = llm.prompt(problem, tools=[run_simple_calculator])
536+
kbench.assertions.assert_tool_was_invoked(run_simple_calculator)
596537

597538
kbench.assertions.assert_true(
598539
str(int(expected_answer)) in final_answer,
599540
f"Expected '{expected_answer}' to be in the final answer, got '{final_answer}'.",
600541
)
542+
543+
544+
# %%
545+
def increment_counter() -> int:
    """Increments a counter and returns the value."""
    # State lives on the function object itself; the calling test resets
    # `increment_counter.count` before each run.
    new_value = increment_counter.count + 1
    increment_counter.count = new_value
    return new_value
549+
550+
551+
@benchmark_test()
@kbench.task()
def test_stateful_tool_double_execution(llm):
    # Start from a fresh counter so state cannot leak in from earlier runs.
    increment_counter.count = 0

    llm.prompt("Call the increment_counter tool.", tools=[increment_counter])

    # The side effect on the counter proves the tool ran exactly one time.
    kbench.assertions.assert_equal(
        1, increment_counter.count, expectation="Tool should be executed exactly once."
    )
561+
562+
563+
# %%
564+
def add_tool(a: float, b: float) -> float:
    """Adds two numbers."""
    # Invocation counter (a function attribute) lets tests assert call counts;
    # the calling test resets `add_tool.calls` before each run.
    add_tool.calls = add_tool.calls + 1
    return a + b
568+
569+
570+
def multiply_tool(a: float, b: float) -> float:
    """Multiplies two numbers."""
    # Invocation counter (a function attribute) lets tests assert call counts;
    # the calling test resets `multiply_tool.calls` before each run.
    multiply_tool.calls = multiply_tool.calls + 1
    return a * b
574+
575+
576+
@benchmark_test()
@kbench.task()
def test_multiple_tool_selection(llm):
    # Reset per-run call counters on both candidate tools.
    for tool in (add_tool, multiply_tool):
        tool.calls = 0

    llm.prompt(
        "What is 12 multiplied by 34? Use the multiply_tool.",
        tools=[add_tool, multiply_tool],
    )

    # The model must pick the named tool and leave the distractor untouched.
    kbench.assertions.assert_equal(
        1, multiply_tool.calls, expectation="Multiply tool should be called once."
    )
    kbench.assertions.assert_equal(
        0, add_tool.calls, expectation="Add tool should not be called."
    )
593+
594+
595+
# %%
596+
def get_user_profile(user_id: str) -> dict:
    """Returns user profile information as a dictionary."""
    # One known user; any other id falls back to an anonymous profile.
    known_profiles = {
        "user_123": {"name": "Alice", "role": "Admin", "skills": ["Python", "SQL"]},
    }
    return known_profiles.get(user_id, {"name": "Unknown", "role": "User", "skills": []})
601+
602+
603+
@benchmark_test()
@kbench.task()
def test_complex_tool_return(llm):
    # The tool returns a dict; the model must extract the "role" field from it.
    question = "Get the profile for user_123 and tell me what their role is."
    response = llm.prompt(question, tools=[get_user_profile])

    kbench.assertions.assert_contains_regex(
        r"(?i)admin", response, expectation="Model should identify the role as Admin."
    )
614+
615+
616+
# %%
617+
def flaky_tool() -> str:
    """This tool always fails with an error."""
    # Deterministic failure used to exercise the model's error-reporting path.
    failure_message = "Tool execution failed simulated error."
    raise ValueError(failure_message)
620+
621+
622+
@benchmark_test()
@kbench.task()
def test_tool_error_handling(llm):
    # The tool raises; the model is expected to surface that failure in its answer.
    response = llm.prompt(
        "Call the flaky_tool and report what happens.", tools=[flaky_tool]
    )

    kbench.assertions.assert_contains_regex(
        r"(?i)error|failed|valueerror",
        response,
        expectation="Model should report the tool failure.",
    )

src/kaggle_benchmarks/kaggle/serialization.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,10 @@ def _message_to_proto_content(message: benchmark_messages.Message) -> dict[str,
233233

234234
part_value: dict[str, Any] = {} # Holds the 'image' or 'text' part
235235

236-
if isinstance(message.content, (content_types.images.ImageURL, content_types.videos.VideoContent)):
236+
if isinstance(
237+
message.content,
238+
(content_types.images.ImageURL, content_types.videos.VideoContent),
239+
):
237240
mime = message.content.to_mime()
238241
part_value["file_data"] = {
239242
"file_uri": mime["location"],

0 commit comments

Comments
 (0)