Skip to content

Commit 85a0abe

Browse files
authored
test(tools): Add more golden tests for tools (#114)
1. Add golden tests for tools for the upcoming PR #12 2. chore: reformatting
1 parent c17573f commit 85a0abe

4 files changed

Lines changed: 102 additions & 66 deletions

File tree

documentation/examples/quick_start_examples.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def describe_image(llm, image_url: str, question: str, answer: str):
104104
answer="Cavalier King Charles Spaniel",
105105
)
106106

107+
107108
# %%
108109
@kbench.task(name="describe_video")
109110
def describe_video(llm, video_url: str, question: str, answer: str):

golden_tests/conftest.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ def module_report_fixture(request):
6262
test_result.outcome
6363
)
6464

65-
6665
base_name = Path(request.module.__file__).stem
6766
report_filename = f"{base_name}_report.yaml"
6867
report_path = Path(__file__).parent / report_filename

golden_tests/test_cookbook_examples.py

Lines changed: 97 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
"""
3232

3333
# %%
34-
import json
3534
import os
3635
import tempfile
3736
from contextlib import contextmanager
@@ -44,7 +43,6 @@
4443
from pydantic import BaseModel, Field
4544

4645
import kaggle_benchmarks as kbench
47-
from kaggle_benchmarks import messages
4846
from kaggle_benchmarks.content_types import images, videos
4947

5048
# Models to be tested as the primary subject.
@@ -516,6 +514,7 @@ def test_video_url(llm):
516514

517515

518516
def run_simple_calculator(a: float, b: float, operator: str) -> float:
517+
"""Supported operators are: + - * and /"""
519518
if operator == "+":
520519
return a + b
521520
if operator == "-":
@@ -527,74 +526,108 @@ def run_simple_calculator(a: float, b: float, operator: str) -> float:
527526
raise ValueError(f"Unknown operator: {operator}")
528527

529528

530-
@benchmark_test(
531-
exclude={
532-
"anthropic/claude-haiku-4-5@20251001",
533-
"anthropic/claude-opus-4-5@20251101",
534-
"anthropic/claude-sonnet-4-5@20250929",
535-
"deepseek-ai/deepseek-r1-0528",
536-
"deepseek-ai/deepseek-v3.2",
537-
"google/gemma-3-12b",
538-
"google/gemini-3.1-flash-lite-preview",
539-
}
540-
)
529+
@benchmark_test()
541530
@kbench.task()
542-
def test_manual_tool_use(llm):
531+
def test_simple_tool_use(llm):
543532
problem = "What is 50 plus 25?"
544533
expected_answer = 75.0
545534

546-
# Define the tool schema (JSON Schema format).
547-
calculator_tool = {
548-
"type": "function",
549-
"function": {
550-
"name": "simple_calculator",
551-
"description": "Calculates the result of an arithmetic operation.",
552-
"parameters": {
553-
"type": "object",
554-
"properties": {
555-
"a": {"type": "number", "description": "The first number."},
556-
"b": {"type": "number", "description": "The second number."},
557-
"operator": {
558-
"type": "string",
559-
"description": "The operator (+, -, *, /).",
560-
},
561-
},
562-
"required": ["a", "b", "operator"],
563-
},
564-
},
565-
}
566-
567-
# 1. Get the initial response from the LLM.
568-
kbench.actors.user.send(problem)
569-
tool_call_msg = llm.respond(tools=[calculator_tool])
570-
tool_calls = tool_call_msg.tool_calls
571-
572-
kbench.assertions.assert_true(bool(tool_calls), "LLM was expected to call a tool.")
573-
574-
# 2. Parse the tool call arguments.
575-
tool_call = tool_calls[0]
576-
function_args = json.loads(tool_call["function"]["arguments"])
577-
# Removes 'signature' parameter in thinking mode.
578-
function_args.pop("signature", None)
579-
580-
# 3. Execute the actual Python function.
581-
tool_result = run_simple_calculator(**function_args)
582-
583-
# 4. Send the tool result back to the LLM.
584-
tool_actor = kbench.actors.Actor(name="Tool", role="tool", avatar="🛠️")
585-
tool_actor.send(
586-
messages.Message(
587-
sender=tool_actor,
588-
content=str(tool_result),
589-
_meta={"tool_call_id": tool_call["id"]},
590-
)
591-
)
592-
593-
# 5. Get the final answer.
594-
final_answer_msg = llm.respond()
595-
final_answer = final_answer_msg.content
535+
final_answer = llm.prompt(problem, tools=[run_simple_calculator])
536+
kbench.assertions.assert_tool_was_invoked(run_simple_calculator)
596537

597538
kbench.assertions.assert_true(
598539
str(int(expected_answer)) in final_answer,
599540
f"Expected '{expected_answer}' to be in the final answer, got '{final_answer}'.",
600541
)
542+
543+
544+
# %%
545+
def increment_counter() -> int:
    """Increments a counter and returns the value."""
    # State lives on the function object itself; the calling test resets
    # `increment_counter.count` before each run.
    new_value = increment_counter.count + 1
    increment_counter.count = new_value
    return new_value
549+
550+
551+
@benchmark_test()
@kbench.task()
def test_stateful_tool_double_execution(llm):
    # Start from a fresh counter so state cannot leak in from earlier runs.
    increment_counter.count = 0

    llm.prompt("Call the increment_counter tool.", tools=[increment_counter])

    # The side effect on the counter proves the tool ran exactly one time.
    kbench.assertions.assert_equal(
        1, increment_counter.count, expectation="Tool should be executed exactly once."
    )
561+
562+
563+
# %%
564+
def add_tool(a: float, b: float) -> float:
    """Adds two numbers."""
    # Invocation counter (a function attribute) lets tests assert call counts;
    # the calling test resets `add_tool.calls` before each run.
    add_tool.calls = add_tool.calls + 1
    return a + b
568+
569+
570+
def multiply_tool(a: float, b: float) -> float:
    """Multiplies two numbers."""
    # Invocation counter (a function attribute) lets tests assert call counts;
    # the calling test resets `multiply_tool.calls` before each run.
    multiply_tool.calls = multiply_tool.calls + 1
    return a * b
574+
575+
576+
@benchmark_test()
@kbench.task()
def test_multiple_tool_selection(llm):
    # Reset per-run call counters on both candidate tools.
    for tool in (add_tool, multiply_tool):
        tool.calls = 0

    llm.prompt(
        "What is 12 multiplied by 34? Use the multiply_tool.",
        tools=[add_tool, multiply_tool],
    )

    # The model must pick the named tool and leave the distractor untouched.
    kbench.assertions.assert_equal(
        1, multiply_tool.calls, expectation="Multiply tool should be called once."
    )
    kbench.assertions.assert_equal(
        0, add_tool.calls, expectation="Add tool should not be called."
    )
593+
594+
595+
# %%
596+
def get_user_profile(user_id: str) -> dict:
    """Returns user profile information as a dictionary."""
    # One known user; any other id falls back to an anonymous profile.
    known_profiles = {
        "user_123": {"name": "Alice", "role": "Admin", "skills": ["Python", "SQL"]},
    }
    return known_profiles.get(user_id, {"name": "Unknown", "role": "User", "skills": []})
601+
602+
603+
@benchmark_test()
@kbench.task()
def test_complex_tool_return(llm):
    # The tool returns a dict; the model must extract the "role" field from it.
    question = "Get the profile for user_123 and tell me what their role is."
    response = llm.prompt(question, tools=[get_user_profile])

    kbench.assertions.assert_contains_regex(
        r"(?i)admin", response, expectation="Model should identify the role as Admin."
    )
614+
615+
616+
# %%
617+
def flaky_tool() -> str:
    """This tool always fails with an error."""
    # Deterministic failure used to exercise the model's error-reporting path.
    failure_message = "Tool execution failed simulated error."
    raise ValueError(failure_message)
620+
621+
622+
@benchmark_test()
@kbench.task()
def test_tool_error_handling(llm):
    # The tool raises; the model is expected to surface that failure in its answer.
    response = llm.prompt(
        "Call the flaky_tool and report what happens.", tools=[flaky_tool]
    )

    kbench.assertions.assert_contains_regex(
        r"(?i)error|failed|valueerror",
        response,
        expectation="Model should report the tool failure.",
    )

src/kaggle_benchmarks/kaggle/serialization.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,10 @@ def _message_to_proto_content(message: benchmark_messages.Message) -> dict[str,
233233

234234
part_value: dict[str, Any] = {} # Holds the 'image' or 'text' part
235235

236-
if isinstance(message.content, (content_types.images.ImageURL, content_types.videos.VideoContent)):
236+
if isinstance(
237+
message.content,
238+
(content_types.images.ImageURL, content_types.videos.VideoContent),
239+
):
237240
mime = message.content.to_mime()
238241
part_value["file_data"] = {
239242
"file_uri": mime["location"],

0 commit comments

Comments
 (0)