Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions tests/test_client_multimodal_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,33 @@ async def test_anthropic_from_native_response_extracts_usage():
assert response.usage.reasoning_tokens == 0


@pytest.mark.asyncio
async def test_anthropic_from_native_response_extracts_cache_usage():
    """Cache-creation tokens count as prompt tokens; cache reads are reported separately."""
    # Skip (instead of failing with ImportError) when the optional anthropic
    # package is absent: the client module imports it at import time.
    pytest.importorskip("anthropic")
    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient

    client = AnthropicMessagesClient(object())
    native_response = SimpleNamespace(
        id="msg_cache",
        model="claude-haiku-4-5",
        stop_reason="end_turn",
        content=[SimpleNamespace(type="text", text="Hello!")],
        usage=SimpleNamespace(
            input_tokens=42,
            output_tokens=17,
            cache_creation_input_tokens=8,
            cache_read_input_tokens=100,
        ),
    )

    response = await client.from_native_response(native_response)

    assert response.usage is not None
    # 42 input tokens + 8 cache-creation tokens.
    assert response.usage.prompt_tokens == 50
    assert response.usage.completion_tokens == 17
    # Cache reads are surfaced on their own field ...
    assert response.usage.cached_input_tokens == 100
    # ... and are not added to the total (50 prompt + 17 completion).
    assert response.usage.total_tokens == 67
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing importorskip guard breaks test without anthropic

Medium Severity

The new test_anthropic_from_native_response_extracts_cache_usage test imports AnthropicMessagesClient without first calling pytest.importorskip("anthropic"). Every other Anthropic test in this file (lines 57, 103, 126, 156, 213, 235) uses this guard. Since anthropic_messages_client.py unconditionally imports from anthropic at the top level, this test will crash with an ImportError in environments where the anthropic package is not installed, instead of being gracefully skipped.

Fix in Cursor Fix in Web

Triggered by project rule: BugBot Instructions

Reviewed by Cursor Bugbot for commit badc2c5. Configure here.



@pytest.mark.asyncio
async def test_anthropic_from_native_response_always_parses_reasoning():
pytest.importorskip("anthropic")
Expand Down
26 changes: 26 additions & 0 deletions tests/test_prompt_cache_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from verifiers.types import ClientConfig
from verifiers.utils.prompt_cache_utils import apply_prompt_cache_to_kwargs


def test_anthropic_cache_control_hint_is_default_only():
    """The ephemeral cache_control hint is only a default; a caller-supplied one wins."""
    official_endpoint = {
        "client_type": "anthropic_messages",
        "api_base_url": "https://api.anthropic.com/v1",
    }

    # No cache_control in the sampling args: the default hint is injected.
    defaulted = apply_prompt_cache_to_kwargs(
        config=ClientConfig(**official_endpoint),
        sampling_args={"max_tokens": 16},
        extra_kwargs={},
    )
    assert defaulted == {"cache_control": {"type": "ephemeral"}}

    # Caller already chose a cache_control: extra kwargs are left untouched.
    caller_controlled = apply_prompt_cache_to_kwargs(
        config=ClientConfig(**official_endpoint),
        sampling_args={"cache_control": {"type": "custom"}},
        extra_kwargs={},
    )
    assert caller_controlled == {}
13 changes: 13 additions & 0 deletions verifiers/clients/anthropic_messages_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,18 @@ def parse_finish_reason(response: AnthropicMessage) -> FinishReason:

input_tokens = response.usage.input_tokens
output_tokens = response.usage.output_tokens
cached_input_tokens = getattr(response.usage, "cache_read_input_tokens", None)
cache_creation_input_tokens = getattr(
response.usage, "cache_creation_input_tokens", None
)
if isinstance(cache_creation_input_tokens, int) and not isinstance(
cache_creation_input_tokens, bool
):
input_tokens += cache_creation_input_tokens
if not isinstance(cached_input_tokens, int) or isinstance(
cached_input_tokens, bool
):
cached_input_tokens = None
Comment thread
cursor[bot] marked this conversation as resolved.

return Response(
id=response.id,
Expand All @@ -478,6 +490,7 @@ def parse_finish_reason(response: AnthropicMessage) -> FinishReason:
completion_tokens=output_tokens,
reasoning_tokens=0,
total_tokens=input_tokens + output_tokens,
cached_input_tokens=cached_input_tokens,
),
message=ResponseMessage(
content=content,
Expand Down
6 changes: 6 additions & 0 deletions verifiers/clients/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
SamplingArgs,
Tool,
)
from verifiers.utils.prompt_cache_utils import apply_prompt_cache_to_kwargs

if TYPE_CHECKING:
pass
Expand Down Expand Up @@ -126,6 +127,11 @@ async def get_response(

native_prompt, extra_kwargs = await self.to_native_prompt(prompt)
native_tools = await self.to_native_tools(tools)
extra_kwargs = apply_prompt_cache_to_kwargs(
config=self._config,
sampling_args=sampling_args,
extra_kwargs=extra_kwargs,
)
native_response = await self.get_native_response(
native_prompt,
model,
Expand Down
16 changes: 16 additions & 0 deletions verifiers/clients/openai_chat_completions_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,13 +423,29 @@ def parse_usage(response: OpenAIChatResponse) -> Usage | None:
completion_tokens, int
):
return None
prompt_details = get_usage_field(usage, "prompt_tokens_details")
if prompt_details is None:
prompt_details = get_usage_field(usage, "input_tokens_details")
cached_tokens = None
if prompt_details is not None:
reported_cached_tokens = get_usage_field(
prompt_details, "cached_tokens"
)
if isinstance(reported_cached_tokens, int) and not isinstance(
reported_cached_tokens, bool
):
cached_tokens = reported_cached_tokens
prompt_tokens = max(0, prompt_tokens - cached_tokens)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OpenAI cached tokens excluded from cost calculation

Low Severity

For OpenAI-compatible clients, prompt_tokens is reduced by subtracting cached_tokens, and total_tokens is similarly reduced. The downstream cost calculation in compute_cost_usd uses input_tokens (derived from prompt_tokens) but never accounts for cached_input_tokens. This causes cost estimates to silently drop all cached token charges when a provider reports cache hits through an OpenAI-compatible interface.

Additional Locations (1)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit badc2c5. Configure here.

if not isinstance(total_tokens, int):
total_tokens = prompt_tokens + completion_tokens
elif cached_tokens is not None:
total_tokens = max(0, total_tokens - cached_tokens)
return Usage(
prompt_tokens=prompt_tokens,
reasoning_tokens=0,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
cached_input_tokens=cached_tokens,
)

def parse_is_truncated(response: OpenAIChatResponse) -> bool:
Expand Down
14 changes: 14 additions & 0 deletions verifiers/clients/openai_responses_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,15 +385,29 @@ def parse_usage(response: OpenAIResponsesNativeResponse) -> Usage | None:
completion_tokens, int
):
return None
input_details = get_usage_field(usage, "input_tokens_details")
if input_details is None:
input_details = get_usage_field(usage, "prompt_tokens_details")
cached_tokens = None
if input_details is not None:
reported_cached_tokens = get_usage_field(input_details, "cached_tokens")
if isinstance(reported_cached_tokens, int) and not isinstance(
reported_cached_tokens, bool
):
cached_tokens = reported_cached_tokens
prompt_tokens = max(0, prompt_tokens - cached_tokens)
if not isinstance(total_tokens, int):
total_tokens = prompt_tokens + completion_tokens
elif cached_tokens is not None:
total_tokens = max(0, total_tokens - cached_tokens)
if not isinstance(reasoning_tokens, int):
reasoning_tokens = 0
return Usage(
prompt_tokens=prompt_tokens,
reasoning_tokens=reasoning_tokens,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
cached_input_tokens=cached_tokens,
)

def parse_is_truncated(response: OpenAIResponsesNativeResponse) -> bool:
Expand Down
7 changes: 6 additions & 1 deletion verifiers/envs/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,15 @@ def get_state_usage(self, state: State) -> TokenUsage | None:
usage = state.get("usage")
if isinstance(usage, Mapping):
try:
return {
out: TokenUsage = {
"input_tokens": float(usage.get("input_tokens", 0.0)),
"output_tokens": float(usage.get("output_tokens", 0.0)),
}
for key in ("cached_input_tokens",):
value = usage.get(key)
if value is not None:
out[key] = float(value)
return out
except (TypeError, ValueError):
return None
return None
Expand Down
2 changes: 2 additions & 0 deletions verifiers/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ class Usage(CustomBaseModel):
reasoning_tokens: int
completion_tokens: int
total_tokens: int
cached_input_tokens: int | None = None


class RoutedExpertsPayload(TypedDict):
Expand Down Expand Up @@ -249,6 +250,7 @@ class TrajectoryStepTokens(TypedDict):
class TokenUsage(TypedDict):
input_tokens: float
output_tokens: float
cached_input_tokens: NotRequired[float]
final_input_tokens: NotRequired[float]
final_output_tokens: NotRequired[float]

Expand Down
3 changes: 3 additions & 0 deletions verifiers/utils/eval_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,9 @@ def _make_tokens_row(
"input": format_numeric(usage.get("input_tokens", 0.0)),
"output": format_numeric(usage.get("output_tokens", 0.0)),
}
cached = usage.get("cached_input_tokens")
if cached is not None:
kv["cached_input"] = format_numeric(cached)
inp = usage.get("final_input_tokens")
out = usage.get("final_output_tokens")
if inp is not None:
Expand Down
11 changes: 11 additions & 0 deletions verifiers/utils/eval_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,8 @@ def print_usage(results: GenerateOutputs):
usage_count = 0
input_total = 0.0
output_total = 0.0
cached_input_total = 0.0
cached_input_count = 0
final_input_total = 0.0
final_output_total = 0.0
context_count = 0
Expand All @@ -827,6 +829,10 @@ def print_usage(results: GenerateOutputs):
usage_count += 1
input_total += float(token_usage.get("input_tokens", 0.0))
output_total += float(token_usage.get("output_tokens", 0.0))
cached = token_usage.get("cached_input_tokens")
if cached is not None:
cached_input_total += float(cached)
cached_input_count += 1
inp = token_usage.get("final_input_tokens")
out = token_usage.get("final_output_tokens")
if inp is not None and out is not None:
Expand All @@ -840,6 +846,8 @@ def print_usage(results: GenerateOutputs):
input_tokens=input_total / usage_count,
output_tokens=output_total / usage_count,
)
if cached_input_count > 0:
usage["cached_input_tokens"] = cached_input_total / cached_input_count
if context_count > 0:
usage["final_input_tokens"] = final_input_total / context_count
usage["final_output_tokens"] = final_output_total / context_count
Expand All @@ -851,6 +859,9 @@ def print_usage(results: GenerateOutputs):

print("Usage:")
print(f"input_tokens (avg): {float(usage.get('input_tokens', 0.0)):.3f}")
cached = usage.get("cached_input_tokens")
if cached is not None:
print(f"cached_input_tokens (avg): {float(cached):.3f}")
print(f"output_tokens (avg): {float(usage.get('output_tokens', 0.0)):.3f}")
inp = usage.get("final_input_tokens")
out = usage.get("final_output_tokens")
Expand Down
2 changes: 2 additions & 0 deletions verifiers/utils/interception_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,8 @@ def serialize_anthropic_message_response(response: Response) -> dict[str, Any]:
"input_tokens": response.usage.prompt_tokens,
"output_tokens": response.usage.completion_tokens,
}
if response.usage.cached_input_tokens is not None:
usage["cache_read_input_tokens"] = response.usage.cached_input_tokens
return {
"id": response.id,
"type": "message",
Expand Down
6 changes: 6 additions & 0 deletions verifiers/utils/metric_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ class OutputTokensMetric(TokenUsageKeyMetric):
_key = "output_tokens"


class CachedInputTokensMetric(TokenUsageKeyMetric):
    """Mean ``cached_input_tokens`` per output."""

    # Token-usage key this metric reads (presumably consumed by the
    # TokenUsageKeyMetric base class, which is not shown here).
    _key = "cached_input_tokens"


class FinalInputTokensMetric(TokenUsageKeyMetric):
"""Mean final_input_tokens (non-completion context tokens) per output."""

Expand Down
51 changes: 51 additions & 0 deletions verifiers/utils/prompt_cache_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from collections.abc import Mapping
from typing import Any
from urllib.parse import urlsplit

from verifiers.types import ClientConfig

ANTHROPIC_ORIGINS = frozenset({"https://api.anthropic.com"})


def endpoint_origin(api_base_url: str) -> str | None:
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this could be a substring match on api.anthropic.com tbf

parsed = urlsplit(api_base_url)
if not parsed.scheme or not parsed.hostname:
return None
scheme = parsed.scheme.lower()
host = parsed.hostname.lower()
port = parsed.port
netloc = host
if ":" in host:
netloc = f"[{host}]"
if port is not None and not (
(scheme == "https" and port == 443) or (scheme == "http" and port == 80)
):
netloc = f"{netloc}:{port}"
return f"{scheme}://{netloc}"
Comment thread
macroscopeapp[bot] marked this conversation as resolved.


def uses_official_anthropic_messages(config: ClientConfig | None) -> bool:
    """Tell whether ``config`` targets the official Anthropic Messages API.

    True only when a config is given, its ``client_type`` is
    ``"anthropic_messages"``, and the normalized origin of its
    ``api_base_url`` is one of ``ANTHROPIC_ORIGINS``.
    """
    if config is None:
        return False
    if config.client_type != "anthropic_messages":
        return False
    # Exact origin comparison: proxies or look-alike hosts do not qualify.
    origin = endpoint_origin(config.api_base_url)
    return origin in ANTHROPIC_ORIGINS


def _cache_control_payload() -> dict[str, str]:
    """Build the default ``cache_control`` value (an ephemeral cache hint).

    A new dict is created on every call, so callers may mutate the result
    without affecting each other.
    """
    return dict(type="ephemeral")


def apply_prompt_cache_to_kwargs(
    *,
    config: ClientConfig | None,
    sampling_args: Mapping[str, Any],
    extra_kwargs: Mapping[str, Any],
) -> dict[str, Any]:
    """Return a copy of ``extra_kwargs`` with a default prompt-cache hint.

    The ephemeral ``cache_control`` entry is added only when all of the
    following hold:

    * ``config`` targets the official Anthropic Messages API,
    * ``sampling_args`` does not already carry a ``cache_control`` key, and
    * ``extra_kwargs`` does not already carry one either.

    Args:
        config: Client configuration, or ``None`` when unavailable.
        sampling_args: Caller-provided sampling arguments (read only).
        extra_kwargs: Extra request kwargs to extend (never mutated).

    Returns:
        A new dict holding ``extra_kwargs`` plus, when applicable, the
        default ``cache_control`` entry.
    """
    merged = dict(extra_kwargs)
    if not uses_official_anthropic_messages(config):
        return merged
    # A caller-chosen cache_control in the sampling args takes precedence;
    # the default hint must not be sent alongside or instead of it.
    if "cache_control" in sampling_args:
        return merged
    if "cache_control" not in merged:
        merged["cache_control"] = _cache_control_payload()
    return merged
Loading
Loading