diff --git a/pydantic_ai_slim/pydantic_ai/messages.py b/pydantic_ai_slim/pydantic_ai/messages.py index 28447187ef..5d09c63c0d 100644 --- a/pydantic_ai_slim/pydantic_ai/messages.py +++ b/pydantic_ai_slim/pydantic_ai/messages.py @@ -423,7 +423,18 @@ def format(self) -> str: __repr__ = _utils.dataclasses_no_defaults_repr -UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent' +@dataclass(repr=False) +class UploadedFile: + """File uploaded to the LLM provider.""" + + file: Any + """A provider-specific file object, e.g. a file ID or a file URL.""" + + kind: Literal['uploaded-file'] = 'uploaded-file' + """Type identifier, this is available on all parts as a discriminator.""" + + +UserContent: TypeAlias = 'str | ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent | UploadedFile' @dataclass(repr=False) diff --git a/pydantic_ai_slim/pydantic_ai/models/bedrock.py b/pydantic_ai_slim/pydantic_ai/models/bedrock.py index 85766ae216..6928ff1ff0 100644 --- a/pydantic_ai_slim/pydantic_ai/models/bedrock.py +++ b/pydantic_ai_slim/pydantic_ai/models/bedrock.py @@ -35,6 +35,7 @@ ThinkingPart, ToolCallPart, ToolReturnPart, + UploadedFile, UserPromptPart, VideoUrl, ) @@ -577,6 +578,8 @@ async def _map_user_prompt(part: UserPromptPart, document_count: Iterator[int]) content.append({'video': video}) elif isinstance(item, AudioUrl): # pragma: no cover raise NotImplementedError('Audio is not supported yet.') + elif isinstance(item, UploadedFile): + raise NotImplementedError('Uploaded files are not supported yet.') else: assert_never(item) return [{'role': 'user', 'content': content}] diff --git a/pydantic_ai_slim/pydantic_ai/models/gemini.py b/pydantic_ai_slim/pydantic_ai/models/gemini.py index 517acbc614..7fb7748389 100644 --- a/pydantic_ai_slim/pydantic_ai/models/gemini.py +++ b/pydantic_ai_slim/pydantic_ai/models/gemini.py @@ -33,6 +33,7 @@ ThinkingPart, ToolCallPart, ToolReturnPart, + UploadedFile, UserPromptPart, VideoUrl, ) @@ -368,6 
+369,8 @@ async def _map_user_prompt(self, part: UserPromptPart) -> list[_GeminiPartUnion] else: # pragma: lax no cover file_data = _GeminiFileDataPart(file_data={'file_uri': item.url, 'mime_type': item.media_type}) content.append(file_data) + elif isinstance(item, UploadedFile): + raise NotImplementedError('Uploaded files are not supported for GeminiModel.') else: assert_never(item) # pragma: lax no cover return content diff --git a/pydantic_ai_slim/pydantic_ai/models/google.py b/pydantic_ai_slim/pydantic_ai/models/google.py index 6d21a88678..b1bd80d931 100644 --- a/pydantic_ai_slim/pydantic_ai/models/google.py +++ b/pydantic_ai_slim/pydantic_ai/models/google.py @@ -31,6 +31,7 @@ ThinkingPart, ToolCallPart, ToolReturnPart, + UploadedFile, UserPromptPart, VideoUrl, ) @@ -54,6 +55,7 @@ ContentUnionDict, CountTokensConfigDict, ExecutableCodeDict, + File, FunctionCallDict, FunctionCallingConfigDict, FunctionCallingConfigMode, @@ -425,7 +427,7 @@ async def _map_messages(self, messages: list[ModelMessage]) -> tuple[ContentDict if isinstance(part, SystemPromptPart): system_parts.append({'text': part.content}) elif isinstance(part, UserPromptPart): - message_parts.extend(await self._map_user_prompt(part)) + message_parts.extend(await self._map_user_prompt(part, contents)) elif isinstance(part, ToolReturnPart): message_parts.append( { @@ -465,7 +467,7 @@ async def _map_messages(self, messages: list[ModelMessage]) -> tuple[ContentDict system_instruction = ContentDict(role='user', parts=system_parts) if system_parts else None return system_instruction, contents - async def _map_user_prompt(self, part: UserPromptPart) -> list[PartDict]: + async def _map_user_prompt(self, part: UserPromptPart, contents: list[ContentUnionDict]) -> list[PartDict]: if isinstance(part.content, str): return [{'text': part.content}] else: @@ -499,6 +501,12 @@ async def _map_user_prompt(self, part: UserPromptPart) -> list[PartDict]: content.append( {'file_data': {'file_uri': item.url, 'mime_type': 
item.media_type}} ) # pragma: lax no cover + elif isinstance(item, UploadedFile): + if not isinstance(item.file, File): + raise UserError('UploadedFile.file must be a genai.types.File object') + # genai.types.File is its own ContentUnionDict and not a + # PartDict, so append to the contents directly. + contents.append(item.file) else: assert_never(item) return content diff --git a/pydantic_ai_slim/pydantic_ai/models/huggingface.py b/pydantic_ai_slim/pydantic_ai/models/huggingface.py index ff854b1244..1518b6d617 100644 --- a/pydantic_ai_slim/pydantic_ai/models/huggingface.py +++ b/pydantic_ai_slim/pydantic_ai/models/huggingface.py @@ -32,6 +32,7 @@ ThinkingPart, ToolCallPart, ToolReturnPart, + UploadedFile, UserPromptPart, VideoUrl, ) @@ -424,6 +425,8 @@ async def _map_user_prompt(part: UserPromptPart) -> ChatCompletionInputMessage: raise NotImplementedError('DocumentUrl is not supported for Hugging Face') elif isinstance(item, VideoUrl): raise NotImplementedError('VideoUrl is not supported for Hugging Face') + elif isinstance(item, UploadedFile): + raise NotImplementedError('Uploaded files are not supported for Hugging Face') else: assert_never(item) return ChatCompletionInputMessage(role='user', content=content) # type: ignore diff --git a/pydantic_ai_slim/pydantic_ai/models/openai.py b/pydantic_ai_slim/pydantic_ai/models/openai.py index 7f5d2b5956..ab09e31176 100644 --- a/pydantic_ai_slim/pydantic_ai/models/openai.py +++ b/pydantic_ai_slim/pydantic_ai/models/openai.py @@ -8,6 +8,8 @@ from datetime import datetime from typing import Any, Literal, Union, cast, overload +from httpx import URL +from openai.types import FileObject from pydantic import ValidationError from typing_extensions import assert_never, deprecated @@ -36,6 +38,7 @@ ThinkingPart, ToolCallPart, ToolReturnPart, + UploadedFile, UserPromptPart, VideoUrl, ) @@ -623,7 +626,7 @@ async def _map_user_message(self, message: ModelRequest) -> AsyncIterable[chat.C else: yield 
chat.ChatCompletionSystemMessageParam(role='system', content=part.content) elif isinstance(part, UserPromptPart): - yield await self._map_user_prompt(part) + yield await self._map_user_prompt(part, self._provider) elif isinstance(part, ToolReturnPart): yield chat.ChatCompletionToolMessageParam( role='tool', @@ -645,7 +648,7 @@ async def _map_user_message(self, message: ModelRequest) -> AsyncIterable[chat.C assert_never(part) @staticmethod - async def _map_user_prompt(part: UserPromptPart) -> chat.ChatCompletionUserMessageParam: + async def _map_user_prompt(part: UserPromptPart, provider: Provider[Any]) -> chat.ChatCompletionUserMessageParam: content: str | list[ChatCompletionContentPartParam] if isinstance(part.content, str): content = part.content @@ -697,6 +700,9 @@ async def _map_user_prompt(part: UserPromptPart) -> chat.ChatCompletionUserMessa content.append(file) elif isinstance(item, VideoUrl): # pragma: no cover raise NotImplementedError('VideoUrl is not supported for OpenAI') + elif isinstance(item, UploadedFile): + file = _map_uploaded_file(item, provider) + content.append(File(file=FileFile(file_id=file.id), type='file')) else: assert_never(item) return chat.ChatCompletionUserMessageParam(role='user', content=content) @@ -984,7 +990,7 @@ async def _map_messages( if isinstance(part, SystemPromptPart): openai_messages.append(responses.EasyInputMessageParam(role='system', content=part.content)) elif isinstance(part, UserPromptPart): - openai_messages.append(await self._map_user_prompt(part)) + openai_messages.append(await self._map_user_prompt(part, self._provider)) elif isinstance(part, ToolReturnPart): openai_messages.append( FunctionCallOutput( @@ -1066,7 +1072,7 @@ def _map_json_schema(self, o: OutputObjectDefinition) -> responses.ResponseForma return response_format_param @staticmethod - async def _map_user_prompt(part: UserPromptPart) -> responses.EasyInputMessageParam: + async def _map_user_prompt(part: UserPromptPart, provider: Provider[Any]) -> 
responses.EasyInputMessageParam: content: str | list[responses.ResponseInputContentParam] if isinstance(part.content, str): content = part.content @@ -1124,6 +1130,9 @@ async def _map_user_prompt(part: UserPromptPart) -> responses.EasyInputMessagePa ) elif isinstance(item, VideoUrl): # pragma: no cover raise NotImplementedError('VideoUrl is not supported for OpenAI.') + elif isinstance(item, UploadedFile): + file = _map_uploaded_file(item, provider) + content.append(responses.ResponseInputFileParam(file_id=file.id, type='input_file')) else: assert_never(item) return responses.EasyInputMessageParam(role='user', content=content) @@ -1358,3 +1367,19 @@ def _map_usage(response: chat.ChatCompletion | ChatCompletionChunk | responses.R u.input_audio_tokens = response_usage.prompt_tokens_details.audio_tokens or 0 u.cache_read_tokens = response_usage.prompt_tokens_details.cached_tokens or 0 return u + + +def _map_openai_uploaded_file(item: UploadedFile) -> FileObject: + if not isinstance(item.file, FileObject): + raise UserError('UploadedFile.file must be an openai.types.FileObject') + return item.file + + +def _map_uploaded_file(uploaded_file: UploadedFile, provider: Provider[Any]) -> FileObject: + """Map an UploadedFile to a File object.""" + url = URL(provider.base_url) + + if url.host == 'api.openai.com': + return _map_openai_uploaded_file(uploaded_file) + else: + raise UserError(f'UploadedFile is not supported for `{provider.name}` with base_url {provider.base_url}.') diff --git a/tests/assets/smiley.pdf b/tests/assets/smiley.pdf new file mode 100644 index 0000000000..ab8cd0df81 Binary files /dev/null and b/tests/assets/smiley.pdf differ diff --git a/tests/models/cassettes/test_google/test_uploaded_file_input.yaml b/tests/models/cassettes/test_google/test_uploaded_file_input.yaml new file mode 100644 index 0000000000..c8b2896380 --- /dev/null +++ b/tests/models/cassettes/test_google/test_uploaded_file_input.yaml @@ -0,0 +1,70 @@ +interactions: +- request: + headers: + 
accept: + - '*/*' + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '280' + content-type: + - application/json + host: + - generativelanguage.googleapis.com + method: POST + parsed_body: + contents: + - parts: + - fileData: + fileUri: https://generativelanguage.googleapis.com/v1beta/files/6myu0b1v3mxl + mimeType: application/pdf + role: user + - parts: + - text: Give me a short description of this image + role: user + generationConfig: {} + uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent + response: + headers: + alt-svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + content-length: + - '881' + content-type: + - application/json; charset=UTF-8 + server-timing: + - gfet4t7; dur=5652 + transfer-encoding: + - chunked + vary: + - Origin + - X-Origin + - Referer + parsed_body: + candidates: + - content: + parts: + - text: The image displays a classic smiley face. It features a bright yellow circular face with two simple black + dot eyes and an upward-curved black line forming a smile. The yellow circle has a subtle darker yellow outline + and is set against a plain white background. 
+ role: model + finishReason: STOP + index: 0 + modelVersion: gemini-2.5-flash + responseId: T7OkaOv-JOemmtkP5IXU2QI + usageMetadata: + candidatesTokenCount: 51 + promptTokenCount: 268 + promptTokensDetails: + - modality: TEXT + tokenCount: 10 + - modality: DOCUMENT + tokenCount: 258 + thoughtsTokenCount: 678 + totalTokenCount: 997 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/models/cassettes/test_openai/test_uploaded_file_input.yaml b/tests/models/cassettes/test_openai/test_uploaded_file_input.yaml new file mode 100644 index 0000000000..f1eb5cbd1a --- /dev/null +++ b/tests/models/cassettes/test_openai/test_uploaded_file_input.yaml @@ -0,0 +1,85 @@ +interactions: +- request: + headers: + accept: + - application/json + accept-encoding: + - gzip, deflate + connection: + - keep-alive + content-length: + - '206' + content-type: + - application/json + host: + - api.openai.com + method: POST + parsed_body: + messages: + - content: + - text: Give me a short description of this image + type: text + - file: + file_id: file-7yEHnJNSSBeUYfkLq6G8KG + type: file + role: user + model: gpt-4o + stream: false + uri: https://api.openai.com/v1/chat/completions + response: + headers: + access-control-expose-headers: + - X-Request-ID + alt-svc: + - h3=":443"; ma=86400 + connection: + - keep-alive + content-length: + - '974' + content-type: + - application/json + openai-organization: + - coplane + openai-processing-ms: + - '4261' + openai-project: + - proj_KGkpeAYM2vPXvZOVtXfnuZ9r + openai-version: + - '2020-10-01' + strict-transport-security: + - max-age=31536000; includeSubDomains; preload + transfer-encoding: + - chunked + parsed_body: + choices: + - finish_reason: stop + index: 0 + logprobs: null + message: + annotations: [] + content: The image is a simple design of a classic yellow smiley face. It features a bright yellow circle with two + black dots for eyes and a curved black line for a smiling mouth. 
+ refusal: null + role: assistant + created: 1755630898 + id: chatcmpl-C6M5KUA0T23RWuMTuAopnIc4ygeJb + model: gpt-4o-2024-08-06 + object: chat.completion + service_tier: default + system_fingerprint: fp_80956533cb + usage: + completion_tokens: 36 + completion_tokens_details: + accepted_prediction_tokens: 0 + audio_tokens: 0 + reasoning_tokens: 0 + rejected_prediction_tokens: 0 + prompt_tokens: 312 + prompt_tokens_details: + audio_tokens: 0 + cached_tokens: 0 + total_tokens: 348 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/models/test_google.py b/tests/models/test_google.py index d94703e699..5d999016e1 100644 --- a/tests/models/test_google.py +++ b/tests/models/test_google.py @@ -36,6 +36,7 @@ ThinkingPartDelta, ToolCallPart, ToolReturnPart, + UploadedFile, UserPromptPart, VideoUrl, ) @@ -49,6 +50,7 @@ with try_import() as imports_successful: from google.genai.types import ( CodeExecutionResult, + File, GenerateContentResponse, GenerateContentResponseUsageMetadata, HarmBlockThreshold, @@ -1706,3 +1708,28 @@ def test_map_usage(): }, ) ) + + +async def test_uploaded_file_input(allow_model_requests: None, google_provider: GoogleProvider): + m = GoogleModel('gemini-2.5-flash', provider=google_provider) + agent = Agent(m, system_prompt='You are a helpful chatbot.') + # client = google_provider.client + # with open('tests/assets/smiley.pdf', 'rb') as f: + # google_file = client.files.upload( + # file=f, + # config={ + # 'mime_type': 'application/pdf', + # }, + # ) + # print(google_file) + google_file = File( + name='files/6myu0b1v3mxl', + mime_type='application/pdf', + uri='https://generativelanguage.googleapis.com/v1beta/files/6myu0b1v3mxl', + ) + agent = Agent(m) + + result = await agent.run(['Give me a short description of this image', UploadedFile(file=google_file)]) + assert result.output == snapshot( + 'The image displays a classic smiley face. 
It features a bright yellow circular face with two simple black dot eyes and an upward-curved black line forming a smile. The yellow circle has a subtle darker yellow outline and is set against a plain white background.' ) diff --git a/tests/models/test_openai.py b/tests/models/test_openai.py index 466068f354..adbf54c24d 100644 --- a/tests/models/test_openai.py +++ b/tests/models/test_openai.py @@ -35,6 +35,7 @@ ThinkingPartDelta, ToolCallPart, ToolReturnPart, + UploadedFile, UserPromptPart, ) from pydantic_ai.models import ModelRequestParameters @@ -52,7 +53,7 @@ with try_import() as imports_successful: from openai import NOT_GIVEN, APIStatusError, AsyncOpenAI - from openai.types import chat + from openai.types import FileObject, chat from openai.types.chat.chat_completion import Choice, ChoiceLogprobs from openai.types.chat.chat_completion_chunk import ( Choice as ChunkChoice, @@ -2926,3 +2927,35 @@ async def test_openai_model_settings_temperature_ignored_on_gpt_5(allow_model_re result = await agent.run('What is the capital of France?', model_settings=ModelSettings(temperature=0.0)) assert result.output == snapshot('Paris.') + + +async def test_uploaded_file_input(allow_model_requests: None, openai_api_key: str): + provider = OpenAIProvider(api_key=openai_api_key) + m = OpenAIModel('gpt-4o', provider=provider) + # VCR recording breaks on the OpenAI file upload request because of its + # binary contents. For that reason, we manually ran the upload once and + # rebuilt the FileObject by hand (from the printed output).
+ # with open('tests/assets/smiley.pdf', 'rb') as f: + # file_bytes = f.read() + # openai_file = await provider.client.files.create( + # file=('image.pdf', file_bytes, 'application/pdf'), + # purpose='user_data', + # ) + # print(openai_file) + openai_file = FileObject( + id='file-7yEHnJNSSBeUYfkLq6G8KG', + bytes=5930, + created_at=1755612061, + filename='image.pdf', # OpenAI file upload API only accepts pdf + object='file', + purpose='user_data', + status='processed', + expires_at=None, + status_details=None, + ) + agent = Agent(m) + + result = await agent.run(['Give me a short description of this image', UploadedFile(file=openai_file)]) + assert result.output == snapshot( + 'The image is a simple design of a classic yellow smiley face. It features a bright yellow circle with two black dots for eyes and a curved black line for a smiling mouth.' + )