PrimeIntellect-ai · willccbb · May 16, 2026 · May 17, 2026 · May 17, 2026 · May 18, 2026
diff --git a/tests/test_client_multimodal_types.py b/tests/test_client_multimodal_types.py
@@ -181,6 +181,33 @@ async def test_anthropic_from_native_response_extracts_usage():
     assert response.usage.reasoning_tokens == 0
 
 
+@pytest.mark.asyncio
+async def test_anthropic_from_native_response_extracts_cache_usage():
+    """Cache-creation tokens join prompt_tokens; cache reads are split out."""
+    from verifiers.clients.anthropic_messages_client import AnthropicMessagesClient
+
+    cache_usage = SimpleNamespace(
+        input_tokens=42,
+        output_tokens=17,
+        cache_creation_input_tokens=8,
+        cache_read_input_tokens=100,
+    )
+    message = SimpleNamespace(
+        id="msg_cache",
+        model="claude-haiku-4-5",
+        stop_reason="end_turn",
+        content=[SimpleNamespace(type="text", text="Hello!")],
+        usage=cache_usage,
+    )
+    parsed = await AnthropicMessagesClient(object()).from_native_response(message)
+
+    assert parsed.usage is not None
+    assert parsed.usage.prompt_tokens == 50  # 42 uncached + 8 cache-creation
+    assert parsed.usage.completion_tokens == 17
+    assert parsed.usage.cached_input_tokens == 100
+    assert parsed.usage.total_tokens == 67  # cache reads excluded from the total
+
+
 @pytest.mark.asyncio
 async def test_anthropic_from_native_response_always_parses_reasoning():
     pytest.importorskip("anthropic")

diff --git a/tests/test_prompt_cache_utils.py b/tests/test_prompt_cache_utils.py
@@ -0,0 +1,26 @@
+from verifiers.types import ClientConfig
+from verifiers.utils.prompt_cache_utils import apply_prompt_cache_to_kwargs
+
+
+def test_anthropic_cache_control_hint_is_default_only():
+    """The ephemeral hint is injected unless the caller set ``cache_control``."""
+    config = ClientConfig(
+        client_type="anthropic_messages",
+        api_base_url="https://api.anthropic.com/v1",
+    )
+
+    # No caller preference: the default hint is added.
+    defaulted = apply_prompt_cache_to_kwargs(
+        config=config,
+        sampling_args={"max_tokens": 16},
+        extra_kwargs={},
+    )
+    assert defaulted == {"cache_control": {"type": "ephemeral"}}
+
+    # Caller-supplied cache_control in sampling_args suppresses the default.
+    overridden = apply_prompt_cache_to_kwargs(
+        config=config,
+        sampling_args={"cache_control": {"type": "custom"}},
+        extra_kwargs={},
+    )
+    assert overridden == {}
diff --git a/verifiers/clients/anthropic_messages_client.py b/verifiers/clients/anthropic_messages_client.py
@@ -468,6 +468,18 @@ def parse_finish_reason(response: AnthropicMessage) -> FinishReason:
 
         input_tokens = response.usage.input_tokens
         output_tokens = response.usage.output_tokens
+        cached_input_tokens = getattr(response.usage, "cache_read_input_tokens", None)
+        cache_creation_input_tokens = getattr(
+            response.usage, "cache_creation_input_tokens", None
+        )
+        if isinstance(cache_creation_input_tokens, int) and not isinstance(
+            cache_creation_input_tokens, bool
+        ):
+            input_tokens += cache_creation_input_tokens
+        if not isinstance(cached_input_tokens, int) or isinstance(
+            cached_input_tokens, bool
+        ):
+            cached_input_tokens = None
 
         return Response(
             id=response.id,
@@ -478,6 +490,7 @@ def parse_finish_reason(response: AnthropicMessage) -> FinishReason:
                 completion_tokens=output_tokens,
                 reasoning_tokens=0,
                 total_tokens=input_tokens + output_tokens,
+                cached_input_tokens=cached_input_tokens,
             ),
             message=ResponseMessage(
                 content=content,

diff --git a/verifiers/clients/client.py b/verifiers/clients/client.py
@@ -19,6 +19,7 @@
     SamplingArgs,
     Tool,
 )
+from verifiers.utils.prompt_cache_utils import apply_prompt_cache_to_kwargs
 
 if TYPE_CHECKING:
     pass
@@ -126,6 +127,11 @@ async def get_response(
 
             native_prompt, extra_kwargs = await self.to_native_prompt(prompt)
             native_tools = await self.to_native_tools(tools)
+            extra_kwargs = apply_prompt_cache_to_kwargs(
+                config=self._config,
+                sampling_args=sampling_args,
+                extra_kwargs=extra_kwargs,
+            )
             native_response = await self.get_native_response(
                 native_prompt,
                 model,

diff --git a/verifiers/clients/openai_chat_completions_client.py b/verifiers/clients/openai_chat_completions_client.py
@@ -423,13 +423,29 @@ def parse_usage(response: OpenAIChatResponse) -> Usage | None:
                 completion_tokens, int
             ):
                 return None
+            prompt_details = get_usage_field(usage, "prompt_tokens_details")
+            if prompt_details is None:
+                prompt_details = get_usage_field(usage, "input_tokens_details")
+            cached_tokens = None
+            if prompt_details is not None:
+                reported_cached_tokens = get_usage_field(
+                    prompt_details, "cached_tokens"
+                )
+                if isinstance(reported_cached_tokens, int) and not isinstance(
+                    reported_cached_tokens, bool
+                ):
+                    cached_tokens = reported_cached_tokens
+                    prompt_tokens = max(0, prompt_tokens - cached_tokens)
             if not isinstance(total_tokens, int):
                 total_tokens = prompt_tokens + completion_tokens
+            elif cached_tokens is not None:
+                total_tokens = max(0, total_tokens - cached_tokens)
             return Usage(
                 prompt_tokens=prompt_tokens,
                 reasoning_tokens=0,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                cached_input_tokens=cached_tokens,
             )
 
         def parse_is_truncated(response: OpenAIChatResponse) -> bool:

diff --git a/verifiers/clients/openai_responses_client.py b/verifiers/clients/openai_responses_client.py
@@ -385,15 +385,29 @@ def parse_usage(response: OpenAIResponsesNativeResponse) -> Usage | None:
                 completion_tokens, int
             ):
                 return None
+            input_details = get_usage_field(usage, "input_tokens_details")
+            if input_details is None:
+                input_details = get_usage_field(usage, "prompt_tokens_details")
+            cached_tokens = None
+            if input_details is not None:
+                reported_cached_tokens = get_usage_field(input_details, "cached_tokens")
+                if isinstance(reported_cached_tokens, int) and not isinstance(
+                    reported_cached_tokens, bool
+                ):
+                    cached_tokens = reported_cached_tokens
+                    prompt_tokens = max(0, prompt_tokens - cached_tokens)
             if not isinstance(total_tokens, int):
                 total_tokens = prompt_tokens + completion_tokens
+            elif cached_tokens is not None:
+                total_tokens = max(0, total_tokens - cached_tokens)
             if not isinstance(reasoning_tokens, int):
                 reasoning_tokens = 0
             return Usage(
                 prompt_tokens=prompt_tokens,
                 reasoning_tokens=reasoning_tokens,
                 completion_tokens=completion_tokens,
                 total_tokens=total_tokens,
+                cached_input_tokens=cached_tokens,
             )
 
         def parse_is_truncated(response: OpenAIResponsesNativeResponse) -> bool:

diff --git a/verifiers/envs/environment.py b/verifiers/envs/environment.py
@@ -480,10 +480,15 @@ def get_state_usage(self, state: State) -> TokenUsage | None:
         usage = state.get("usage")
         if isinstance(usage, Mapping):
             try:
-                return {
+                out: TokenUsage = {
                     "input_tokens": float(usage.get("input_tokens", 0.0)),
                     "output_tokens": float(usage.get("output_tokens", 0.0)),
                 }
+                for key in ("cached_input_tokens",):
+                    value = usage.get(key)
+                    if value is not None:
+                        out[key] = float(value)
+                return out
             except (TypeError, ValueError):
                 return None
         return None

diff --git a/verifiers/types.py b/verifiers/types.py
@@ -181,6 +181,7 @@ class Usage(CustomBaseModel):
     reasoning_tokens: int
     completion_tokens: int
     total_tokens: int
+    cached_input_tokens: int | None = None
 
 
 class RoutedExpertsPayload(TypedDict):
@@ -249,6 +250,7 @@ class TrajectoryStepTokens(TypedDict):
 class TokenUsage(TypedDict):
     input_tokens: float
     output_tokens: float
+    cached_input_tokens: NotRequired[float]
     final_input_tokens: NotRequired[float]
     final_output_tokens: NotRequired[float]
 

diff --git a/verifiers/utils/eval_display.py b/verifiers/utils/eval_display.py
@@ -410,6 +410,9 @@ def _make_tokens_row(
             "input": format_numeric(usage.get("input_tokens", 0.0)),
             "output": format_numeric(usage.get("output_tokens", 0.0)),
         }
+        cached = usage.get("cached_input_tokens")
+        if cached is not None:
+            kv["cached_input"] = format_numeric(cached)
         inp = usage.get("final_input_tokens")
         out = usage.get("final_output_tokens")
         if inp is not None:

diff --git a/verifiers/utils/eval_utils.py b/verifiers/utils/eval_utils.py
@@ -817,6 +817,8 @@ def print_usage(results: GenerateOutputs):
     usage_count = 0
     input_total = 0.0
     output_total = 0.0
+    cached_input_total = 0.0
+    cached_input_count = 0
     final_input_total = 0.0
     final_output_total = 0.0
     context_count = 0
@@ -827,6 +829,10 @@ def print_usage(results: GenerateOutputs):
         usage_count += 1
         input_total += float(token_usage.get("input_tokens", 0.0))
         output_total += float(token_usage.get("output_tokens", 0.0))
+        cached = token_usage.get("cached_input_tokens")
+        if cached is not None:
+            cached_input_total += float(cached)
+            cached_input_count += 1
         inp = token_usage.get("final_input_tokens")
         out = token_usage.get("final_output_tokens")
         if inp is not None and out is not None:
@@ -840,6 +846,8 @@ def print_usage(results: GenerateOutputs):
             input_tokens=input_total / usage_count,
             output_tokens=output_total / usage_count,
         )
+        if cached_input_count > 0:
+            usage["cached_input_tokens"] = cached_input_total / cached_input_count
         if context_count > 0:
             usage["final_input_tokens"] = final_input_total / context_count
             usage["final_output_tokens"] = final_output_total / context_count
@@ -851,6 +859,9 @@ def print_usage(results: GenerateOutputs):
 
     print("Usage:")
     print(f"input_tokens (avg): {float(usage.get('input_tokens', 0.0)):.3f}")
+    cached = usage.get("cached_input_tokens")
+    if cached is not None:
+        print(f"cached_input_tokens (avg): {float(cached):.3f}")
     print(f"output_tokens (avg): {float(usage.get('output_tokens', 0.0)):.3f}")
     inp = usage.get("final_input_tokens")
     out = usage.get("final_output_tokens")

diff --git a/verifiers/utils/interception_utils.py b/verifiers/utils/interception_utils.py
@@ -868,6 +868,8 @@ def serialize_anthropic_message_response(response: Response) -> dict[str, Any]:
             "input_tokens": response.usage.prompt_tokens,
             "output_tokens": response.usage.completion_tokens,
         }
+        if response.usage.cached_input_tokens is not None:
+            usage["cache_read_input_tokens"] = response.usage.cached_input_tokens
     return {
         "id": response.id,
         "type": "message",

diff --git a/verifiers/utils/metric_utils.py b/verifiers/utils/metric_utils.py
@@ -92,6 +92,12 @@ class OutputTokensMetric(TokenUsageKeyMetric):
     _key = "output_tokens"
 
 
+class CachedInputTokensMetric(TokenUsageKeyMetric):
+    """Mean cached_input_tokens (cache-read prompt tokens) per output."""
+
+    _key = "cached_input_tokens"
+
+
 class FinalInputTokensMetric(TokenUsageKeyMetric):
     """Mean final_input_tokens (non-completion context tokens) per output."""
 

diff --git a/verifiers/utils/prompt_cache_utils.py b/verifiers/utils/prompt_cache_utils.py
@@ -0,0 +1,51 @@
+from collections.abc import Mapping
+from typing import Any
+from urllib.parse import urlsplit
+
+from verifiers.types import ClientConfig
+
+ANTHROPIC_ORIGINS = frozenset({"https://api.anthropic.com"})
+
+
+def endpoint_origin(api_base_url: str) -> str | None:
+    """Return the normalized ``scheme://host[:port]`` origin, or None if invalid."""
+    try:
+        parsed = urlsplit(api_base_url)
+        port = parsed.port  # raises ValueError for non-numeric/out-of-range ports
+    except ValueError:
+        return None
+    scheme, host = parsed.scheme.lower(), (parsed.hostname or "").lower()
+    if not scheme or not host:
+        return None
+    netloc = f"[{host}]" if ":" in host else host  # re-bracket IPv6 literals
+    default_port = {"https": 443, "http": 80}.get(scheme)  # omitted from origin
+    if port is not None and port != default_port:
+        netloc = f"{netloc}:{port}"
+    return f"{scheme}://{netloc}"
+
+
+def uses_official_anthropic_messages(config: ClientConfig | None) -> bool:
+    """Return True when ``config`` targets an origin in ``ANTHROPIC_ORIGINS``."""
+    if config is None or config.client_type != "anthropic_messages":
+        return False
+    # Same client type against any other origin does not count as official.
+    return endpoint_origin(config.api_base_url) in ANTHROPIC_ORIGINS
+
+
+def _cache_control_payload() -> dict[str, str]:
+    return {"type": "ephemeral"}  # fresh dict per call, never a shared instance
+
+
+def apply_prompt_cache_to_kwargs(
+    *,
+    config: ClientConfig | None,
+    sampling_args: Mapping[str, Any],
+    extra_kwargs: Mapping[str, Any],
+) -> dict[str, Any]:
+    """Copy ``extra_kwargs``, adding the default ``cache_control`` hint if unset."""
+    merged = dict(extra_kwargs)
+    if not uses_official_anthropic_messages(config):
+        return merged
+    if "cache_control" not in sampling_args and "cache_control" not in merged:
+        merged["cache_control"] = _cache_control_payload()  # default only
+    return merged