3131"""
3232
3333# %%
34- import json
3534import os
3635import tempfile
3736from contextlib import contextmanager
4443from pydantic import BaseModel , Field
4544
4645import kaggle_benchmarks as kbench
47- from kaggle_benchmarks import messages
4846from kaggle_benchmarks .content_types import images , videos
4947
5048# Models to be tested as the primary subject.
@@ -516,6 +514,7 @@ def test_video_url(llm):
516514
517515
518516def run_simple_calculator (a : float , b : float , operator : str ) -> float :
517+ """Supported operators are: + - * and /"""
519518 if operator == "+" :
520519 return a + b
521520 if operator == "-" :
@@ -527,74 +526,108 @@ def run_simple_calculator(a: float, b: float, operator: str) -> float:
527526 raise ValueError (f"Unknown operator: { operator } " )
528527
529528
@benchmark_test()
@kbench.task()
def test_simple_tool_use(llm):
    # Asks the model an addition problem while offering run_simple_calculator
    # as a tool, then checks that the tool was invoked and that the integer
    # result appears in the model's reply.
    # NOTE(review): no docstring is added here on purpose -- the task decorator
    # may read it as task metadata; confirm before converting this comment.
    problem = "What is 50 plus 25?"
    expected_answer = 75.0
    # The containment check searches for the integer rendering ("75"), so the
    # failure message reports that same text instead of the float ("75.0").
    expected_text = str(int(expected_answer))

    final_answer = llm.prompt(problem, tools=[run_simple_calculator])
    kbench.assertions.assert_tool_was_invoked(run_simple_calculator)

    kbench.assertions.assert_true(
        expected_text in final_answer,
        f"Expected '{expected_text}' to be in the final answer, got '{final_answer}'.",
    )
542+
543+
544+ # %%
def increment_counter() -> int:
    """Increments a counter and returns the value."""
    # The counter is stored as an attribute on this function object; callers
    # must initialise `increment_counter.count` before the first call.
    # NOTE(review): the docstring is left untouched because it is presumably
    # surfaced to the model as the tool description -- confirm.
    bumped = increment_counter.count + 1
    increment_counter.count = bumped
    return bumped
549+
550+
@benchmark_test()
@kbench.task()
def test_stateful_tool_double_execution(llm):
    # Verifies that a single prompt asking for the tool leaves the
    # function-attribute counter at exactly 1.
    increment_counter.count = 0  # start every run from a clean counter

    instruction = "Call the increment_counter tool."
    llm.prompt(instruction, tools=[increment_counter])

    observed_calls = increment_counter.count
    kbench.assertions.assert_equal(
        1,
        observed_calls,
        expectation="Tool should be executed exactly once.",
    )
561+
562+
563+ # %%
def add_tool(a: float, b: float) -> float:
    """Adds two numbers."""
    # Record the invocation on the function object so callers can inspect
    # `add_tool.calls`; the attribute must be initialised before first use.
    add_tool.calls = add_tool.calls + 1
    total = a + b
    return total
568+
569+
def multiply_tool(a: float, b: float) -> float:
    """Multiplies two numbers."""
    # Record the invocation on the function object so callers can inspect
    # `multiply_tool.calls`; the attribute must be initialised before first use.
    multiply_tool.calls = multiply_tool.calls + 1
    product = a * b
    return product
574+
575+
@benchmark_test()
@kbench.task()
def test_multiple_tool_selection(llm):
    # Offers both arithmetic tools and checks, via their call counters, that
    # only multiply_tool ran (once) and add_tool did not run at all.
    for tool in (add_tool, multiply_tool):
        tool.calls = 0  # reset counters before prompting

    question = "What is 12 multiplied by 34? Use the multiply_tool."
    available_tools = [add_tool, multiply_tool]
    llm.prompt(question, tools=available_tools)

    kbench.assertions.assert_equal(
        1,
        multiply_tool.calls,
        expectation="Multiply tool should be called once.",
    )
    kbench.assertions.assert_equal(
        0,
        add_tool.calls,
        expectation="Add tool should not be called.",
    )
593+
594+
595+ # %%
def get_user_profile(user_id: str) -> dict:
    """Returns user profile information as a dictionary."""
    # Only "user_123" has a real profile; every other id gets a placeholder.
    # A new dict is built on each call, so callers never share state.
    if user_id != "user_123":
        return {"name": "Unknown", "role": "User", "skills": []}
    return {"name": "Alice", "role": "Admin", "skills": ["Python", "SQL"]}
601+
602+
@benchmark_test()
@kbench.task()
def test_complex_tool_return(llm):
    # Checks that the model's reply mentions "admin" (case-insensitive) after
    # being offered get_user_profile, which returns a dictionary.
    request = "Get the profile for user_123 and tell me what their role is."
    response = llm.prompt(request, tools=[get_user_profile])

    role_pattern = r"(?i)admin"
    kbench.assertions.assert_contains_regex(
        role_pattern,
        response,
        expectation="Model should identify the role as Admin.",
    )
614+
615+
616+ # %%
def flaky_tool() -> str:
    """This tool always fails with an error."""
    # Unconditionally raises; the declared `str` return is never produced.
    failure_message = "Tool execution failed simulated error."
    raise ValueError(failure_message)
620+
621+
@benchmark_test()
@kbench.task()
def test_tool_error_handling(llm):
    # Offers a tool that always raises and checks that the model's reply
    # mentions the failure (matched case-insensitively).
    request = "Call the flaky_tool and report what happens."
    response = llm.prompt(request, tools=[flaky_tool])

    failure_pattern = r"(?i)error|failed|valueerror"
    kbench.assertions.assert_contains_regex(
        failure_pattern,
        response,
        expectation="Model should report the tool failure.",
    )
0 commit comments