diff --git a/docs/llms-full.txt b/docs/llms-full.txt
index e33b033c0..c4bac19ba 100644
--- a/docs/llms-full.txt
+++ b/docs/llms-full.txt
@@ -33,7 +33,7 @@ The Agents SDK delivers a focused set of Python primitives—agents, tools, guar
 - [Voice quickstart](https://openai.github.io/openai-agents-python/voice/quickstart/): Build an end-to-end voice assistant with streaming transcription, text-to-speech, and event-driven responses.
 - [Voice pipeline](https://openai.github.io/openai-agents-python/voice/pipeline/): Customize audio capture, buffering, model invocation, and playback in voice-first experiences.
 - [Voice tracing](https://openai.github.io/openai-agents-python/voice/tracing/): Inspect voice session traces, latency breakdowns, and audio event timelines.
-- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Launch realtime agents over WebRTC or websockets, subscribe to events, and manage low-latency execution.
+- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Launch realtime agents over websockets (WebRTC is not available in the Python SDK), subscribe to events, and manage low-latency execution.
 - [Realtime guide](https://openai.github.io/openai-agents-python/realtime/guide/): Deep dive into realtime session lifecycle, event schemas, concurrency, and backpressure handling.
 
 ## Models and Provider Integrations
diff --git a/docs/llms.txt b/docs/llms.txt
index d7dc81c7c..86c4058e1 100644
--- a/docs/llms.txt
+++ b/docs/llms.txt
@@ -36,7 +36,7 @@ The SDK focuses on a concise set of primitives so you can orchestrate multi-agen
 ## Modalities and Interfaces
 - [Voice quickstart](https://openai.github.io/openai-agents-python/voice/quickstart/): Build speech-enabled agents with streaming transcription and TTS.
 - [Voice pipeline](https://openai.github.io/openai-agents-python/voice/pipeline/): Customize audio ingestion, tool execution, and response rendering.
-- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Stand up low-latency realtime agents with WebRTC and websocket transports.
+- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Stand up low-latency realtime agents with websocket transport (WebRTC is not available in the Python SDK).
 - [Realtime guide](https://openai.github.io/openai-agents-python/realtime/guide/): Deep dive into session lifecycle, event formats, and concurrency patterns.
 
 ## API Reference Highlights
diff --git a/src/agents/realtime/__init__.py b/src/agents/realtime/__init__.py
index 3f0793fa1..74937d151 100644
--- a/src/agents/realtime/__init__.py
+++ b/src/agents/realtime/__init__.py
@@ -84,6 +84,7 @@
 )
 from .openai_realtime import (
     DEFAULT_MODEL_SETTINGS,
+    OpenAIRealtimeSIPModel,
     OpenAIRealtimeWebSocketModel,
     get_api_key,
 )
@@ -176,6 +177,7 @@
     "RealtimeModelUserInputMessage",
     # OpenAI Realtime
     "DEFAULT_MODEL_SETTINGS",
+    "OpenAIRealtimeSIPModel",
     "OpenAIRealtimeWebSocketModel",
     "get_api_key",
     # Session
diff --git a/src/agents/realtime/audio_formats.py b/src/agents/realtime/audio_formats.py
index d9757d244..fdfe12304 100644
--- a/src/agents/realtime/audio_formats.py
+++ b/src/agents/realtime/audio_formats.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+from collections.abc import Mapping
+from typing import Any, Literal
+
 from openai.types.realtime.realtime_audio_formats import (
     AudioPCM,
     AudioPCMA,
@@ -11,7 +14,7 @@
 
 
 def to_realtime_audio_format(
-    input_audio_format: str | RealtimeAudioFormats | None,
+    input_audio_format: str | RealtimeAudioFormats | Mapping[str, Any] | None,
 ) -> RealtimeAudioFormats | None:
     format: RealtimeAudioFormats | None = None
     if input_audio_format is not None:
@@ -24,6 +27,27 @@
                 format = AudioPCMA(type="audio/pcma")
             else:
                 logger.debug(f"Unknown input_audio_format: {input_audio_format}")
+        elif isinstance(input_audio_format, Mapping):
+            fmt_type = input_audio_format.get("type")
+            rate = input_audio_format.get("rate")
+            if fmt_type == "audio/pcm":
+                pcm_rate: Literal[24000] | None
+                if isinstance(rate, (int, float)) and int(rate) == 24000:
+                    pcm_rate = 24000
+                elif rate is None:
+                    pcm_rate = 24000
+                else:
+                    logger.debug(
+                        f"Unknown pcm rate in input_audio_format mapping: {input_audio_format}"
+                    )
+                    pcm_rate = 24000
+                format = AudioPCM(type="audio/pcm", rate=pcm_rate)
+            elif fmt_type == "audio/pcmu":
+                format = AudioPCMU(type="audio/pcmu")
+            elif fmt_type == "audio/pcma":
+                format = AudioPCMA(type="audio/pcma")
+            else:
+                logger.debug(f"Unknown input_audio_format mapping: {input_audio_format}")
         else:
             format = input_audio_format
     return format
diff --git a/src/agents/realtime/config.py b/src/agents/realtime/config.py
index 9b6712a28..4ac9bcaa4 100644
--- a/src/agents/realtime/config.py
+++ b/src/agents/realtime/config.py
@@ -1,10 +1,7 @@
 from __future__ import annotations
 
-from typing import (
-    Any,
-    Literal,
-    Union,
-)
+from collections.abc import Mapping
+from typing import Any, Literal, Union
 
 from openai.types.realtime.realtime_audio_formats import (
     RealtimeAudioFormats as OpenAIRealtimeAudioFormats,
@@ -28,13 +25,20 @@
         "gpt-4o-realtime-preview-2024-12-17",
         "gpt-4o-realtime-preview-2024-10-01",
         "gpt-4o-mini-realtime-preview-2024-12-17",
+        "gpt-realtime-mini",
+        "gpt-realtime-mini-2025-10-06",
     ],
     str,
 ]
 """The name of a realtime model."""
 
 
-RealtimeAudioFormat: TypeAlias = Union[Literal["pcm16", "g711_ulaw", "g711_alaw"], str]
+RealtimeAudioFormat: TypeAlias = Union[
+    Literal["pcm16", "g711_ulaw", "g711_alaw"],
+    str,
+    Mapping[str, Any],
+    OpenAIRealtimeAudioFormats,
+]
 """The audio format for realtime audio streams."""
 
 
@@ -96,6 +100,30 @@ class RealtimeTurnDetectionConfig(TypedDict):
     """Threshold for server-vad to trigger a response if the user is idle for this duration."""
 
 
+class RealtimeAudioInputConfig(TypedDict, total=False):
+    """Configuration for audio input in realtime sessions."""
+
+    format: RealtimeAudioFormat | OpenAIRealtimeAudioFormats
+    noise_reduction: RealtimeInputAudioNoiseReductionConfig | None
+    transcription: RealtimeInputAudioTranscriptionConfig
+    turn_detection: RealtimeTurnDetectionConfig
+
+
+class RealtimeAudioOutputConfig(TypedDict, total=False):
+    """Configuration for audio output in realtime sessions."""
+
+    format: RealtimeAudioFormat | OpenAIRealtimeAudioFormats
+    voice: str
+    speed: float
+
+
+class RealtimeAudioConfig(TypedDict, total=False):
+    """Audio configuration for realtime sessions."""
+
+    input: RealtimeAudioInputConfig
+    output: RealtimeAudioOutputConfig
+
+
 class RealtimeSessionModelSettings(TypedDict):
     """Model settings for a realtime model session."""
 
@@ -111,6 +139,12 @@ class RealtimeSessionModelSettings(TypedDict):
     modalities: NotRequired[list[Literal["text", "audio"]]]
     """The modalities the model should support."""
 
+    output_modalities: NotRequired[list[Literal["text", "audio"]]]
+    """The output modalities the model should support."""
+
+    audio: NotRequired[RealtimeAudioConfig]
+    """The audio configuration for the session."""
+
     voice: NotRequired[str]
     """The voice to use for audio output."""
diff --git a/src/agents/realtime/openai_realtime.py b/src/agents/realtime/openai_realtime.py
index 236162622..af8625f09 100644
--- a/src/agents/realtime/openai_realtime.py
+++ b/src/agents/realtime/openai_realtime.py
@@ -79,7 +79,7 @@
 )
 from openai.types.responses.response_prompt import ResponsePrompt
 from pydantic import Field, TypeAdapter
-from typing_extensions import assert_never
+from typing_extensions import TypeAlias, assert_never
 from websockets.asyncio.client import ClientConnection
 
 from agents.handoffs import Handoff
@@ -91,11 +91,15 @@
 from ..exceptions import UserError
 from ..logger import logger
+from ..run_context import RunContextWrapper, TContext
 from ..version import __version__
+from .agent import RealtimeAgent
 from .config import (
     RealtimeModelTracingConfig,
+    RealtimeRunConfig,
     RealtimeSessionModelSettings,
 )
+from .handoffs import realtime_handoff
 from .items import RealtimeMessageItem, RealtimeToolCallItem
 from .model import (
     RealtimeModel,
@@ -131,6 +135,16 @@
     RealtimeModelSendUserInput,
 )
 
+FormatInput: TypeAlias = Union[
+    str,
+    AudioPCM,
+    AudioPCMU,
+    AudioPCMA,
+    Mapping[str, Any],
+    None,
+]
+
+
 # Avoid direct imports of non-exported names by referencing via module
 OpenAIRealtimeAudioConfig = _rt_audio_config.RealtimeAudioConfig
 OpenAIRealtimeAudioInput = _rt_audio_config.RealtimeAudioConfigInput  # type: ignore[attr-defined]
@@ -178,6 +192,60 @@ def get_server_event_type_adapter() -> TypeAdapter[AllRealtimeServerEvents]:
     return ServerEventTypeAdapter
 
 
+async def _collect_enabled_handoffs(
+    agent: RealtimeAgent[Any], context_wrapper: RunContextWrapper[Any]
+) -> list[Handoff[Any, RealtimeAgent[Any]]]:
+    handoffs: list[Handoff[Any, RealtimeAgent[Any]]] = []
+    for handoff_item in agent.handoffs:
+        if isinstance(handoff_item, Handoff):
+            handoffs.append(handoff_item)
+        elif isinstance(handoff_item, RealtimeAgent):
+            handoffs.append(realtime_handoff(handoff_item))
+
+    async def _check_handoff_enabled(handoff_obj: Handoff[Any, RealtimeAgent[Any]]) -> bool:
+        attr = handoff_obj.is_enabled
+        if isinstance(attr, bool):
+            return attr
+        res = attr(context_wrapper, agent)
+        if inspect.isawaitable(res):
+            return await res
+        return res
+
+    results = await asyncio.gather(*(_check_handoff_enabled(h) for h in handoffs))
+    return [h for h, ok in zip(handoffs, results) if ok]
+
+
+async def _build_model_settings_from_agent(
+    *,
+    agent: RealtimeAgent[Any],
+    context_wrapper: RunContextWrapper[Any],
+    base_settings: RealtimeSessionModelSettings,
+    starting_settings: RealtimeSessionModelSettings | None,
+    run_config: RealtimeRunConfig | None,
+) -> RealtimeSessionModelSettings:
+    updated_settings = base_settings.copy()
+
+    if agent.prompt is not None:
+        updated_settings["prompt"] = agent.prompt
+
+    instructions, tools, handoffs = await asyncio.gather(
+        agent.get_system_prompt(context_wrapper),
+        agent.get_all_tools(context_wrapper),
+        _collect_enabled_handoffs(agent, context_wrapper),
+    )
+    updated_settings["instructions"] = instructions or ""
+    updated_settings["tools"] = tools or []
+    updated_settings["handoffs"] = handoffs or []
+
+    if starting_settings:
+        updated_settings.update(starting_settings)
+
+    if run_config and run_config.get("tracing_disabled", False):
+        updated_settings["tracing"] = None
+
+    return updated_settings
+
+
 # Note: Avoid a module-level union alias for Python 3.9 compatibility.
 # Using a union at runtime (e.g., A | B) in a type alias triggers evaluation
 # during import on 3.9. We instead inline the union in annotations below.
@@ -819,6 +887,27 @@ def _read_format_type(fmt: object) -> str | None:
         return type_value if isinstance(type_value, str) else None
 
+    @staticmethod
+    def _normalize_turn_detection_config(config: object) -> object:
+        """Normalize camelCase turn detection keys to snake_case for API compatibility."""
+        if not isinstance(config, Mapping):
+            return config
+
+        normalized = dict(config)
+        key_map = {
+            "createResponse": "create_response",
+            "interruptResponse": "interrupt_response",
+            "prefixPaddingMs": "prefix_padding_ms",
+            "silenceDurationMs": "silence_duration_ms",
+            "idleTimeoutMs": "idle_timeout_ms",
+        }
+        for camel_key, snake_key in key_map.items():
+            if camel_key in normalized and snake_key not in normalized:
+                normalized[snake_key] = normalized[camel_key]
+            normalized.pop(camel_key, None)
+
+        return normalized
+
     async def _update_session_config(self, model_settings: RealtimeSessionModelSettings) -> None:
         session_config = self._get_session_config(model_settings)
         await self._send_raw_message(
@@ -829,62 +918,95 @@ def _get_session_config(
         self, model_settings: RealtimeSessionModelSettings
     ) -> OpenAISessionCreateRequest:
         """Get the session config."""
-        audio_input_args = {}
+        audio_input_args: dict[str, Any] = {}
+        audio_output_args: dict[str, Any] = {}
+
+        audio_config = model_settings.get("audio")
+        audio_config_mapping = audio_config if isinstance(audio_config, Mapping) else None
+        input_audio_config: Mapping[str, Any] = (
+            cast(Mapping[str, Any], audio_config_mapping.get("input", {}))
+            if audio_config_mapping
+            else {}
+        )
+        output_audio_config: Mapping[str, Any] = (
+            cast(Mapping[str, Any], audio_config_mapping.get("output", {}))
+            if audio_config_mapping
+            else {}
+        )
 
-        if self._call_id:
-            audio_input_args["format"] = to_realtime_audio_format(
-                model_settings.get("input_audio_format")
-            )
-        else:
-            audio_input_args["format"] = to_realtime_audio_format(
-                model_settings.get(
+        input_format_source: FormatInput = (
+            input_audio_config.get("format") if input_audio_config else None
+        )
+        if input_format_source is None:
+            if self._call_id:
+                input_format_source = model_settings.get("input_audio_format")
+            else:
+                input_format_source = model_settings.get(
                     "input_audio_format", DEFAULT_MODEL_SETTINGS.get("input_audio_format")
                 )
-            )
+        audio_input_args["format"] = to_realtime_audio_format(input_format_source)
 
-        if "input_audio_noise_reduction" in model_settings:
-            audio_input_args["noise_reduction"] = model_settings.get("input_audio_noise_reduction")  # type: ignore[assignment]
model_settings.get("input_audio_noise_reduction") # type: ignore[assignment] + if "noise_reduction" in input_audio_config: + audio_input_args["noise_reduction"] = input_audio_config.get("noise_reduction") + elif "input_audio_noise_reduction" in model_settings: + audio_input_args["noise_reduction"] = model_settings.get("input_audio_noise_reduction") - if "input_audio_transcription" in model_settings: - audio_input_args["transcription"] = model_settings.get("input_audio_transcription") # type: ignore[assignment] + if "transcription" in input_audio_config: + audio_input_args["transcription"] = input_audio_config.get("transcription") + elif "input_audio_transcription" in model_settings: + audio_input_args["transcription"] = model_settings.get("input_audio_transcription") else: - audio_input_args["transcription"] = DEFAULT_MODEL_SETTINGS.get( # type: ignore[assignment] + audio_input_args["transcription"] = DEFAULT_MODEL_SETTINGS.get( "input_audio_transcription" ) - if "turn_detection" in model_settings: - audio_input_args["turn_detection"] = model_settings.get("turn_detection") # type: ignore[assignment] + if "turn_detection" in input_audio_config: + audio_input_args["turn_detection"] = self._normalize_turn_detection_config( + input_audio_config.get("turn_detection") + ) + elif "turn_detection" in model_settings: + audio_input_args["turn_detection"] = self._normalize_turn_detection_config( + model_settings.get("turn_detection") + ) else: - audio_input_args["turn_detection"] = DEFAULT_MODEL_SETTINGS.get("turn_detection") # type: ignore[assignment] + audio_input_args["turn_detection"] = DEFAULT_MODEL_SETTINGS.get("turn_detection") - audio_output_args = { - "voice": model_settings.get("voice", DEFAULT_MODEL_SETTINGS.get("voice")), - } + requested_voice = output_audio_config.get("voice") if output_audio_config else None + audio_output_args["voice"] = requested_voice or model_settings.get( + "voice", DEFAULT_MODEL_SETTINGS.get("voice") + ) - if self._call_id: - audio_output_args["format"] = to_realtime_audio_format( # type: ignore[assignment] - model_settings.get("output_audio_format") - ) - else: - audio_output_args["format"] = to_realtime_audio_format( # type: ignore[assignment] - model_settings.get( + output_format_source: FormatInput = ( + output_audio_config.get("format") if output_audio_config else None + ) + if output_format_source is None: + if self._call_id: + output_format_source = model_settings.get("output_audio_format") + else: + output_format_source = model_settings.get( "output_audio_format", DEFAULT_MODEL_SETTINGS.get("output_audio_format") ) - ) + audio_output_args["format"] = to_realtime_audio_format(output_format_source) - if "speed" in model_settings: - audio_output_args["speed"] = model_settings.get("speed") # type: ignore[assignment] + if "speed" in output_audio_config: + audio_output_args["speed"] = output_audio_config.get("speed") + elif "speed" in model_settings: + audio_output_args["speed"] = model_settings.get("speed") + + output_modalities = ( + model_settings.get("output_modalities") + or model_settings.get("modalities") + or DEFAULT_MODEL_SETTINGS.get("modalities") + ) # Construct full session object. `type` will be excluded at serialization time for updates. 
         session_create_request = OpenAISessionCreateRequest(
             type="realtime",
             model=(model_settings.get("model_name") or self.model) or "gpt-realtime",
-            output_modalities=model_settings.get(
-                "modalities", DEFAULT_MODEL_SETTINGS.get("modalities")
-            ),
+            output_modalities=output_modalities,
             audio=OpenAIRealtimeAudioConfig(
-                input=OpenAIRealtimeAudioInput(**audio_input_args),  # type: ignore[arg-type]
-                output=OpenAIRealtimeAudioOutput(**audio_output_args),  # type: ignore[arg-type]
+                input=OpenAIRealtimeAudioInput(**audio_input_args),
+                output=OpenAIRealtimeAudioOutput(**audio_output_args),
             ),
             tools=cast(
                 Any,
@@ -949,6 +1071,42 @@ def _tools_to_session_tools(
 class OpenAIRealtimeSIPModel(OpenAIRealtimeWebSocketModel):
     """Realtime model that attaches to SIP-originated calls using a call ID."""
 
+    @staticmethod
+    async def build_initial_session_payload(
+        agent: RealtimeAgent[Any],
+        *,
+        context: TContext | None = None,
+        model_config: RealtimeModelConfig | None = None,
+        run_config: RealtimeRunConfig | None = None,
+        overrides: RealtimeSessionModelSettings | None = None,
+    ) -> OpenAISessionCreateRequest:
+        """Build a session payload that mirrors what a RealtimeSession would send on connect.
+
+        This helper can be used to accept SIP-originated calls by forwarding the returned payload
+        to the Realtime Calls API without duplicating session setup logic.
+        """
+        run_config_settings = (run_config or {}).get("model_settings") or {}
+        initial_model_settings = (model_config or {}).get("initial_model_settings") or {}
+        base_settings: RealtimeSessionModelSettings = {
+            **run_config_settings,
+            **initial_model_settings,
+        }
+
+        context_wrapper = RunContextWrapper(context)
+        merged_settings = await _build_model_settings_from_agent(
+            agent=agent,
+            context_wrapper=context_wrapper,
+            base_settings=base_settings,
+            starting_settings=initial_model_settings,
+            run_config=run_config,
+        )
+
+        if overrides:
+            merged_settings.update(overrides)
+
+        model = OpenAIRealtimeWebSocketModel()
+        return model._get_session_config(merged_settings)
+
     async def connect(self, options: RealtimeModelConfig) -> None:
         call_id = options.get("call_id")
         if not call_id:
diff --git a/tests/realtime/test_audio_formats_unit.py b/tests/realtime/test_audio_formats_unit.py
index 5c621d462..3eaf13556 100644
--- a/tests/realtime/test_audio_formats_unit.py
+++ b/tests/realtime/test_audio_formats_unit.py
@@ -1,4 +1,4 @@
-from openai.types.realtime.realtime_audio_formats import AudioPCM
+from openai.types.realtime.realtime_audio_formats import AudioPCM, AudioPCMA, AudioPCMU
 
 from agents.realtime.audio_formats import to_realtime_audio_format
 
@@ -26,3 +26,24 @@ def test_to_realtime_audio_format_passthrough_and_unknown_logs():
 
 def test_to_realtime_audio_format_none():
     assert to_realtime_audio_format(None) is None
+
+
+def test_to_realtime_audio_format_from_mapping():
+    pcm = to_realtime_audio_format({"type": "audio/pcm", "rate": 16000})
+    assert isinstance(pcm, AudioPCM)
+    assert pcm.type == "audio/pcm"
+    assert pcm.rate == 24000
+
+    pcm_default_rate = to_realtime_audio_format({"type": "audio/pcm"})
+    assert isinstance(pcm_default_rate, AudioPCM)
+    assert pcm_default_rate.rate == 24000
+
+    ulaw = to_realtime_audio_format({"type": "audio/pcmu"})
+    assert isinstance(ulaw, AudioPCMU)
+    assert ulaw.type == "audio/pcmu"
+
+    alaw = to_realtime_audio_format({"type": "audio/pcma"})
+    assert isinstance(alaw, AudioPCMA)
+    assert alaw.type == "audio/pcma"
+
+    assert to_realtime_audio_format({"type": "audio/unknown", "rate": 8000}) is None
diff --git a/tests/realtime/test_openai_realtime.py b/tests/realtime/test_openai_realtime.py
index 5954bbc93..7895989d6 100644
--- a/tests/realtime/test_openai_realtime.py
+++ b/tests/realtime/test_openai_realtime.py
@@ -669,6 +669,48 @@ def test_session_config_preserves_sip_audio_formats(self, model):
         assert cfg.audio.output is not None
         assert cfg.audio.output.format is None
 
+    def test_session_config_respects_audio_block_and_output_modalities(self, model):
+        settings = {
+            "input_audio_format": "pcm16",
+            "output_audio_format": "pcm16",
+            "modalities": ["audio"],
+            "output_modalities": ["text"],
+            "audio": {
+                "input": {
+                    "format": {"type": "audio/pcmu"},
+                    "turn_detection": {
+                        "type": "server_vad",
+                        "createResponse": True,
+                        "silenceDurationMs": 450,
+                    },
+                },
+                "output": {
+                    "format": {"type": "audio/pcma"},
+                    "voice": "synth-1",
+                    "speed": 1.5,
+                },
+            },
+        }
+        cfg = model._get_session_config(settings)
+
+        assert cfg.output_modalities == ["text"]
+        assert cfg.audio is not None
+        assert cfg.audio.input.format is not None
+        assert cfg.audio.input.format.type == "audio/pcmu"
+        assert cfg.audio.output.format is not None
+        assert cfg.audio.output.format.type == "audio/pcma"
+        assert cfg.audio.output.voice == "synth-1"
+        assert cfg.audio.output.speed == 1.5
+        assert cfg.audio.input.transcription is not None
+
+        turn_detection = cfg.audio.input.turn_detection
+        turn_detection_mapping = (
+            turn_detection if isinstance(turn_detection, dict) else turn_detection.model_dump()
+        )
+        assert turn_detection_mapping["create_response"] is True
+        assert turn_detection_mapping["silence_duration_ms"] == 450
+        assert "silenceDurationMs" not in turn_detection_mapping
+
     @pytest.mark.asyncio
     async def test_handle_error_event_success(self, model):
         """Test successful handling of error events."""
diff --git a/tests/realtime/test_realtime_model_settings.py b/tests/realtime/test_realtime_model_settings.py
new file mode 100644
index 000000000..a73a63414
--- /dev/null
+++ b/tests/realtime/test_realtime_model_settings.py
@@ -0,0 +1,131 @@
+from __future__ import annotations
+
+from unittest.mock import AsyncMock
+
+import pytest
+from openai.types.realtime.realtime_session_create_request import (
+    RealtimeSessionCreateRequest,
+)
+
+from agents.handoffs import Handoff
+from agents.realtime.agent import RealtimeAgent
+from agents.realtime.config import RealtimeRunConfig, RealtimeSessionModelSettings
+from agents.realtime.handoffs import realtime_handoff
+from agents.realtime.model import RealtimeModelConfig
+from agents.realtime.openai_realtime import (
+    OpenAIRealtimeSIPModel,
+    _build_model_settings_from_agent,
+    _collect_enabled_handoffs,
+)
+from agents.run_context import RunContextWrapper
+from agents.tool import function_tool
+
+
+@pytest.mark.asyncio
+async def test_collect_enabled_handoffs_filters_disabled() -> None:
+    parent = RealtimeAgent(name="parent")
+    disabled = realtime_handoff(
+        RealtimeAgent(name="child_disabled"),
+        is_enabled=lambda ctx, agent: False,
+    )
+    parent.handoffs = [disabled, RealtimeAgent(name="child_enabled")]
+
+    enabled = await _collect_enabled_handoffs(parent, RunContextWrapper(None))
+
+    assert len(enabled) == 1
+    assert isinstance(enabled[0], Handoff)
+    assert enabled[0].agent_name == "child_enabled"
+
+
+@pytest.mark.asyncio
+async def test_build_model_settings_from_agent_merges_agent_fields(monkeypatch: pytest.MonkeyPatch):
+    agent = RealtimeAgent(name="root", prompt={"id": "prompt-id"})
+    monkeypatch.setattr(agent, "get_system_prompt", AsyncMock(return_value="sys"))
+
+    @function_tool
+    def helper() -> str:
+        """Helper tool for testing."""
+        return "ok"
+
+    monkeypatch.setattr(agent, "get_all_tools", AsyncMock(return_value=[helper]))
+    agent.handoffs = [RealtimeAgent(name="handoff-child")]
+    base_settings: RealtimeSessionModelSettings = {"model_name": "gpt-realtime"}
+    starting_settings: RealtimeSessionModelSettings = {"voice": "verse"}
+    run_config: RealtimeRunConfig = {"tracing_disabled": True}
+
+    merged = await _build_model_settings_from_agent(
+        agent=agent,
+        context_wrapper=RunContextWrapper(None),
+        base_settings=base_settings,
+        starting_settings=starting_settings,
+        run_config=run_config,
+    )
+
+    assert merged["prompt"] == {"id": "prompt-id"}
+    assert merged["instructions"] == "sys"
+    assert merged["tools"][0].name == helper.name
+    assert merged["handoffs"][0].agent_name == "handoff-child"
+    assert merged["voice"] == "verse"
+    assert merged["model_name"] == "gpt-realtime"
+    assert merged["tracing"] is None
+    assert base_settings == {"model_name": "gpt-realtime"}
+
+
+@pytest.mark.asyncio
+async def test_sip_model_build_initial_session_payload(monkeypatch: pytest.MonkeyPatch):
+    agent = RealtimeAgent(name="parent", prompt={"id": "prompt-99"})
+    child_agent = RealtimeAgent(name="child")
+    agent.handoffs = [child_agent]
+
+    @function_tool
+    def ping() -> str:
+        """Ping tool used for session payload building."""
+        return "pong"
+
+    monkeypatch.setattr(agent, "get_system_prompt", AsyncMock(return_value="parent-system"))
+    monkeypatch.setattr(agent, "get_all_tools", AsyncMock(return_value=[ping]))
+
+    model_config: RealtimeModelConfig = {
+        "initial_model_settings": {
+            "model_name": "gpt-realtime-mini",
+            "voice": "verse",
+        }
+    }
+    run_config: RealtimeRunConfig = {
+        "model_settings": {"output_modalities": ["text"]},
+        "tracing_disabled": True,
+    }
+    overrides: RealtimeSessionModelSettings = {
+        "audio": {"input": {"format": {"type": "audio/pcmu"}}},
+        "output_audio_format": "g711_ulaw",
+    }
+
+    payload = await OpenAIRealtimeSIPModel.build_initial_session_payload(
+        agent,
+        context={"user": "abc"},
+        model_config=model_config,
+        run_config=run_config,
+        overrides=overrides,
+    )
+
+    assert isinstance(payload, RealtimeSessionCreateRequest)
+    assert payload.model == "gpt-realtime-mini"
+    assert payload.output_modalities == ["text"]
+    assert payload.audio is not None
+    audio = payload.audio
+    assert audio.input is not None
+    assert audio.input.format is not None
+    assert audio.input.format.type == "audio/pcmu"
+    assert audio.output is not None
+    assert audio.output.format is not None
+    assert audio.output.format.type == "audio/pcmu"
+    assert audio.output.voice == "verse"
+    assert payload.instructions == "parent-system"
+    assert payload.prompt is not None and payload.prompt.id == "prompt-99"
+    tool_names: set[str] = set()
+    for tool in payload.tools or []:
+        name = getattr(tool, "name", None)
+        if name:
+            tool_names.add(name)
+    assert ping.name in tool_names
+    assert f"transfer_to_{child_agent.name}" in tool_names
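
Usage sketch: a minimal example of how the new `OpenAIRealtimeSIPModel.build_initial_session_payload` helper might be wired up to accept a SIP-originated call. The helper and its signature come from the patch above; the httpx forwarding step and the `/v1/realtime/calls/{call_id}/accept` route are assumptions for illustration, so check the Realtime Calls API documentation for the actual endpoint.

    import os

    import httpx

    from agents.realtime.agent import RealtimeAgent
    from agents.realtime.openai_realtime import OpenAIRealtimeSIPModel


    async def accept_sip_call(call_id: str) -> None:
        agent = RealtimeAgent(name="assistant", instructions="Answer the caller briefly.")

        # Mirrors what a RealtimeSession would send on connect, including tools,
        # handoffs, and instructions resolved from the agent.
        payload = await OpenAIRealtimeSIPModel.build_initial_session_payload(
            agent,
            model_config={"initial_model_settings": {"model_name": "gpt-realtime"}},
            run_config={"tracing_disabled": True},
        )

        # Assumed forwarding step; the exact Calls API route is not part of this diff.
        async with httpx.AsyncClient() as client:
            response = await client.post(
                f"https://api.openai.com/v1/realtime/calls/{call_id}/accept",
                headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
                json=payload.model_dump(exclude_none=True, mode="json"),
            )
            response.raise_for_status()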