Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/llms-full.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ The Agents SDK delivers a focused set of Python primitives—agents, tools, guar
- [Voice quickstart](https://openai.github.io/openai-agents-python/voice/quickstart/): Build an end-to-end voice assistant with streaming transcription, text-to-speech, and event-driven responses.
- [Voice pipeline](https://openai.github.io/openai-agents-python/voice/pipeline/): Customize audio capture, buffering, model invocation, and playback in voice-first experiences.
- [Voice tracing](https://openai.github.io/openai-agents-python/voice/tracing/): Inspect voice session traces, latency breakdowns, and audio event timelines.
- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Launch realtime agents over WebRTC or websockets, subscribe to events, and manage low-latency execution.
- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Launch realtime agents over websockets (WebRTC is not available in the Python SDK), subscribe to events, and manage low-latency execution.
- [Realtime guide](https://openai.github.io/openai-agents-python/realtime/guide/): Deep dive into realtime session lifecycle, event schemas, concurrency, and backpressure handling.

## Models and Provider Integrations
Expand Down
2 changes: 1 addition & 1 deletion docs/llms.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ The SDK focuses on a concise set of primitives so you can orchestrate multi-agen
## Modalities and Interfaces
- [Voice quickstart](https://openai.github.io/openai-agents-python/voice/quickstart/): Build speech-enabled agents with streaming transcription and TTS.
- [Voice pipeline](https://openai.github.io/openai-agents-python/voice/pipeline/): Customize audio ingestion, tool execution, and response rendering.
- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Stand up low-latency realtime agents with WebRTC and websocket transports.
- [Realtime quickstart](https://openai.github.io/openai-agents-python/realtime/quickstart/): Stand up low-latency realtime agents with websocket transport (WebRTC is not available in the Python SDK).
- [Realtime guide](https://openai.github.io/openai-agents-python/realtime/guide/): Deep dive into session lifecycle, event formats, and concurrency patterns.

## API Reference Highlights
Expand Down
2 changes: 2 additions & 0 deletions src/agents/realtime/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
)
from .openai_realtime import (
DEFAULT_MODEL_SETTINGS,
OpenAIRealtimeSIPModel,
OpenAIRealtimeWebSocketModel,
get_api_key,
)
Expand Down Expand Up @@ -176,6 +177,7 @@
"RealtimeModelUserInputMessage",
# OpenAI Realtime
"DEFAULT_MODEL_SETTINGS",
"OpenAIRealtimeSIPModel",
"OpenAIRealtimeWebSocketModel",
"get_api_key",
# Session
Expand Down
26 changes: 25 additions & 1 deletion src/agents/realtime/audio_formats.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import annotations

from collections.abc import Mapping
from typing import Any, Literal

from openai.types.realtime.realtime_audio_formats import (
AudioPCM,
AudioPCMA,
Expand All @@ -11,7 +14,7 @@


def to_realtime_audio_format(
input_audio_format: str | RealtimeAudioFormats | None,
input_audio_format: str | RealtimeAudioFormats | Mapping[str, Any] | None,
) -> RealtimeAudioFormats | None:
format: RealtimeAudioFormats | None = None
if input_audio_format is not None:
Expand All @@ -24,6 +27,27 @@ def to_realtime_audio_format(
format = AudioPCMA(type="audio/pcma")
else:
logger.debug(f"Unknown input_audio_format: {input_audio_format}")
elif isinstance(input_audio_format, Mapping):
fmt_type = input_audio_format.get("type")
rate = input_audio_format.get("rate")
if fmt_type == "audio/pcm":
pcm_rate: Literal[24000] | None
if isinstance(rate, (int, float)) and int(rate) == 24000:
pcm_rate = 24000
elif rate is None:
pcm_rate = 24000
else:
logger.debug(
f"Unknown pcm rate in input_audio_format mapping: {input_audio_format}"
)
pcm_rate = 24000
format = AudioPCM(type="audio/pcm", rate=pcm_rate)
elif fmt_type == "audio/pcmu":
format = AudioPCMU(type="audio/pcmu")
elif fmt_type == "audio/pcma":
format = AudioPCMA(type="audio/pcma")
else:
logger.debug(f"Unknown input_audio_format mapping: {input_audio_format}")
else:
format = input_audio_format
return format
46 changes: 40 additions & 6 deletions src/agents/realtime/config.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from __future__ import annotations

from typing import (
Any,
Literal,
Union,
)
from collections.abc import Mapping
from typing import Any, Literal, Union

from openai.types.realtime.realtime_audio_formats import (
RealtimeAudioFormats as OpenAIRealtimeAudioFormats,
Expand All @@ -28,13 +25,20 @@
"gpt-4o-realtime-preview-2024-12-17",
"gpt-4o-realtime-preview-2024-10-01",
"gpt-4o-mini-realtime-preview-2024-12-17",
"gpt-realtime-mini",
"gpt-realtime-mini-2025-10-06",
],
str,
]
"""The name of a realtime model."""


RealtimeAudioFormat: TypeAlias = Union[Literal["pcm16", "g711_ulaw", "g711_alaw"], str]
RealtimeAudioFormat: TypeAlias = Union[
Literal["pcm16", "g711_ulaw", "g711_alaw"],
str,
Mapping[str, Any],
OpenAIRealtimeAudioFormats,
]
"""The audio format for realtime audio streams."""


Expand Down Expand Up @@ -96,6 +100,30 @@ class RealtimeTurnDetectionConfig(TypedDict):
"""Threshold for server-vad to trigger a response if the user is idle for this duration."""


class RealtimeAudioInputConfig(TypedDict, total=False):
    """Configuration for audio input in realtime sessions.

    All keys are optional (``total=False``); omitted keys fall back to the
    model's defaults.
    """

    # Wire format of incoming audio: a format name string, a mapping
    # payload, or an OpenAI realtime audio format object.
    format: RealtimeAudioFormat | OpenAIRealtimeAudioFormats
    # Noise-reduction configuration for input audio; may be explicitly None.
    noise_reduction: RealtimeInputAudioNoiseReductionConfig | None
    # Settings for transcribing the user's input audio.
    transcription: RealtimeInputAudioTranscriptionConfig
    # Server-side voice-activity / turn-detection settings.
    turn_detection: RealtimeTurnDetectionConfig


class RealtimeAudioOutputConfig(TypedDict, total=False):
    """Configuration for audio output in realtime sessions.

    All keys are optional (``total=False``); omitted keys fall back to the
    model's defaults.
    """

    # Wire format of generated audio: a format name string, a mapping
    # payload, or an OpenAI realtime audio format object.
    format: RealtimeAudioFormat | OpenAIRealtimeAudioFormats
    # Voice to use for synthesized speech output.
    voice: str
    # Playback speed multiplier for generated audio.
    speed: float


class RealtimeAudioConfig(TypedDict, total=False):
    """Audio configuration for realtime sessions.

    Groups the input- and output-side audio settings; both keys are
    optional (``total=False``).
    """

    # Settings for audio received from the user.
    input: RealtimeAudioInputConfig
    # Settings for audio produced by the model.
    output: RealtimeAudioOutputConfig


class RealtimeSessionModelSettings(TypedDict):
"""Model settings for a realtime model session."""

Expand All @@ -111,6 +139,12 @@ class RealtimeSessionModelSettings(TypedDict):
modalities: NotRequired[list[Literal["text", "audio"]]]
"""The modalities the model should support."""

output_modalities: NotRequired[list[Literal["text", "audio"]]]
"""The output modalities the model should support."""

audio: NotRequired[RealtimeAudioConfig]
"""The audio configuration for the session."""

voice: NotRequired[str]
"""The voice to use for audio output."""

Expand Down
Loading