Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python: draft initial implementation of Realtime API #10127

Draft
wants to merge 23 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
346 changes: 346 additions & 0 deletions docs/decisions/00XX-realtime-api-clients.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion python/.cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
"SEMANTICKERNEL",
"OTEL",
"vectorizable",
"desync"
"desync",
"webrtc"
]
}
11 changes: 8 additions & 3 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ dependencies = [
"pybars4 ~= 0.9",
"jinja2 ~= 3.1",
"nest-asyncio ~= 1.6",
"taskgroup >= 0.2.2; python_version < '3.11'",
]

### Optional dependencies
Expand All @@ -61,7 +62,8 @@ chroma = [
]
google = [
"google-cloud-aiplatform ~= 1.60",
"google-generativeai ~= 0.7"
"google-generativeai ~= 0.7",
"google-genai ~= 0.4"
]
hugging_face = [
"transformers[torch] ~= 4.28",
Expand Down Expand Up @@ -123,6 +125,11 @@ dapr = [
"dapr-ext-fastapi>=1.14.0",
"flask-dapr>=1.14.0"
]
openai_realtime = [
"openai[realtime] ~= 1.0",
"aiortc>=1.9.0",
"sounddevice>=0.5.1",
]

[tool.uv]
prerelease = "if-necessary-or-explicit"
Expand Down Expand Up @@ -220,5 +227,3 @@ name = "semantic_kernel"
[build-system]
requires = ["flit-core >= 3.9,<4.0"]
build-backend = "flit_core.buildapi"


Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.audio.utils import check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.utils import SKAudioPlayer

logging.basicConfig(level=logging.WARNING)
# Quiet the chatty transport libraries; only warnings and above are shown.
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub aiortc

# The characteristics of your speaker and microphone are a big factor in a smooth conversation
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device ids are set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# The call below lists the available audio devices (logged at DEBUG level,
# so raise the log level above if you want to see the output).
check_audio_devices()


async def main() -> None:
    """Run a simple audio chat against the OpenAI Realtime API."""
    # The client can be created with either "websocket" or "webrtc" as the
    # protocol; both behave the same way from the caller's perspective even
    # though the underlying transports are quite different.
    client = OpenAIRealtime("webrtc")
    # Session configuration: the realtime API takes instructions per session
    # instead of a system message.
    session_settings = OpenAIRealtimeExecutionSettings(
        instructions="""
    You are a chat bot. Your name is Mosscap and
    you have one goal: figure out what people need.
    Your full name, should you need to know it, is
    Splendid Speckled Mosscap. You communicate
    effectively, but you tend to answer with long
    flowery prose.
    """,
        voice="alloy",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
    )
    player = SKAudioPlayer()
    print("Mosscap (transcript): ", end="")
    # Entering the context managers creates the realtime session and starts
    # listening to the audio stream.
    async with client, player:
        await client.update_session(settings=session_settings, create_response=True)

        async for event in client.receive():
            kind = event.event_type
            if kind == "audio":
                await player.add_audio(event.audio)
            elif kind == "text":
                print(event.text.text, end="")
            elif kind == "service":
                # OpenAI-specific events.
                if event.service_type == ListenEvents.SESSION_UPDATED:
                    print("Session updated")
                if event.service_type == ListenEvents.RESPONSE_CREATED:
                    print("")
                if event.service_type == ListenEvents.ERROR:
                    logger.error(event.event)


if __name__ == "__main__":
    # Entry point: explain how to interact, then run the async main loop.
    banner = (
        "Instruction: start speaking, when you stop the API should detect you finished and start responding. "
        "Press ctrl + c to stop the program."
    )
    print(banner)
    asyncio.run(main())
144 changes: 144 additions & 0 deletions python/samples/concepts/audio/05-chat_with_realtime_api_complex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
from datetime import datetime
from random import randint

from samples.concepts.audio.utils import check_audio_devices
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack
from semantic_kernel.contents import ChatHistory
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
# Quiet the chatty transport libraries; only warnings and above are shown.
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub aiortc

# The characteristics of your speaker and microphone are a big factor in a smooth conversation
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device ids are set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# The call below lists the available audio devices (logged at DEBUG level,
# so raise the log level above if you want to see the output).


check_audio_devices()


@kernel_function
def get_weather(location: str) -> str:
    """Get the weather for a location."""
    # A mock forecast: pick one of the canned conditions at random.
    conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
    index = randint(0, len(conditions) - 1)  # nosec
    weather = conditions[index]
    logger.info(f"@ Getting weather for {location}: {weather}")
    return f"The weather in {location} is {weather}."


@kernel_function
def get_date_time() -> str:
    """Get the current date and time."""
    logger.info("@ Getting current datetime")
    # Format the local wall-clock time as an ISO-8601 string.
    now = datetime.now().isoformat()
    return f"The current date and time is {now}."


@kernel_function
def goodbye():
    """When the user is done, say goodbye and then call this function."""
    logger.info("@ Goodbye has been called!")
    # Deliberately raise KeyboardInterrupt: it unwinds the sample's async
    # context managers exactly as pressing ctrl+c would, ending the session.
    raise KeyboardInterrupt


async def main() -> None:
    """Run an audio chat with function calling against the OpenAI Realtime API."""
    show_transcript = True
    # Build the Kernel and register the helper functions the model may call.
    kernel = Kernel()
    kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

    # Audio player and track; both take a device_id parameter (the index of
    # the device to use). When None, the default system device is used.
    player = SKAudioPlayer(sample_rate=24000, frame_duration=100, channels=1)
    track = SKAudioTrack()
    # The realtime client can use either "websocket" or "webrtc" as protocol;
    # they behave the same way even though the transports differ greatly.
    client = OpenAIRealtime(
        protocol="websocket",
        audio_output_callback=player.client_callback,
        # audio_track=track,
    )

    # The realtime API does not use a system message; instructions are a
    # session parameter instead.
    instructions = """
    You are a chat bot. Your name is Mosscap and
    you have one goal: figure out what people need.
    Your full name, should you need to know it, is
    Splendid Speckled Mosscap. You communicate
    effectively, but you tend to answer with long
    flowery prose.
    """
    # The key decision is enabling server_vad turn detection. With
    # turn_detection=None you would have to send the
    # "input_audio_buffer.commit" and "response.create" events yourself to
    # end the user's turn and start the response; manual VAD is not part of
    # this sample.
    session_settings = OpenAIRealtimeExecutionSettings(
        instructions=instructions,
        voice="alloy",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
        function_choice_behavior=FunctionChoiceBehavior.Auto(),
    )
    # A chat history can be added to the conversation after it starts.
    history = ChatHistory()
    history.add_user_message("Hi there, who are you?")
    history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

    # The context managers create the session and start the audio streams.
    async with client, player, track.stream_to_realtime_client(client):
        await client.update_session(
            settings=session_settings, chat_history=history, kernel=kernel, create_response=True
        )
        print("Mosscap (transcript): ", end="")
        async for event in client.receive():
            kind = event.event_type
            if kind == "text":
                if show_transcript:
                    print(event.text.text, end="")
            elif kind == "service":
                # OpenAI-specific events.
                if event.service_type == ListenEvents.RESPONSE_CREATED:
                    if show_transcript:
                        print("")
                elif event.service_type == ListenEvents.ERROR:
                    logger.error(event.event)


if __name__ == "__main__":
    # Entry point: explain how to interact, then run the async main loop.
    banner = (
        "Instruction: start speaking, when you stop the API should detect you finished and start responding. "
        "Press ctrl + c to stop the program."
    )
    print(banner)
    asyncio.run(main())
11 changes: 11 additions & 0 deletions python/samples/concepts/audio/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Copyright (c) Microsoft. All rights reserved.

import logging

import sounddevice as sd

logger = logging.getLogger(__name__)


def check_audio_devices():
    """Log and return the audio devices available on this machine.

    Useful for picking a ``device_id`` for the audio player/recorder classes
    used by the samples.

    Note: the list is logged at DEBUG level, while the samples configure
    logging at WARNING — so the log call alone produces no visible output by
    default. The device list is therefore also returned, so callers can print
    or inspect it directly (returning a value is backward-compatible with the
    previous None return).

    Returns:
        The result of ``sounddevice.query_devices()``.
    """
    devices = sd.query_devices()
    logger.debug(devices)
    return devices
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,9 @@ async def get_streaming_chat_message_contents(
for msg in messages:
if msg is not None:
all_messages.append(msg)
if any(isinstance(item, FunctionCallContent) for item in msg.items):
if not function_call_returned and any(
isinstance(item, FunctionCallContent) for item in msg.items
):
function_call_returned = True
yield messages

Expand Down Expand Up @@ -442,7 +444,10 @@ def _get_ai_model_id(self, settings: "PromptExecutionSettings") -> str:
return getattr(settings, "ai_model_id", self.ai_model_id) or self.ai_model_id

def _yield_function_result_messages(self, function_result_messages: list) -> bool:
"""Determine if the function result messages should be yielded."""
"""Determine if the function result messages should be yielded.

If there are messages and if the first message has items, then yield the messages.
"""
return len(function_result_messages) > 0 and len(function_result_messages[0].items) > 0

# endregion
50 changes: 50 additions & 0 deletions python/semantic_kernel/connectors/ai/function_calling_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# Copyright (c) Microsoft. All rights reserved.

from collections import OrderedDict
from collections.abc import Callable
from copy import deepcopy
from typing import TYPE_CHECKING, Any

from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.function_result_content import FunctionResultContent
from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
from semantic_kernel.contents.utils.author_role import AuthorRole
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
from semantic_kernel.utils.experimental_decorator import experimental_function

if TYPE_CHECKING:
from semantic_kernel.connectors.ai.function_choice_behavior import (
Expand All @@ -16,6 +19,7 @@
)
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata
from semantic_kernel.kernel import Kernel


def update_settings_from_function_call_configuration(
Expand Down Expand Up @@ -129,3 +133,49 @@ def merge_streaming_function_results(
function_invoke_attempt=function_invoke_attempt,
)
]


@experimental_function
def prepare_settings_for_function_calling(
    settings: "PromptExecutionSettings",
    settings_class: type["PromptExecutionSettings"],
    update_settings_callback: Callable[..., None],
    kernel: "Kernel",
) -> "PromptExecutionSettings":
    """Prepare settings for the service.

    Args:
        settings: Prompt execution settings.
        settings_class: The settings class.
        update_settings_callback: The callback to update the settings.
        kernel: Kernel instance.

    Returns:
        PromptExecutionSettings of type settings_class.
    """
    # Imported here (not at module level) to avoid circular imports.
    from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior
    from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior

    # Work on a copy so the caller's settings object is never mutated, and
    # convert to the service-specific settings class when needed.
    prepared = deepcopy(settings)
    if not isinstance(prepared, settings_class):
        prepared = settings_class.from_prompt_execution_settings(prepared)

    # Backwards compatibility: translate a legacy `FunctionCallBehavior`
    # carried on the settings into the newer `FunctionChoiceBehavior`.
    legacy_behavior = getattr(prepared, "function_call_behavior", None)
    if isinstance(legacy_behavior, FunctionCallBehavior):
        prepared.function_choice_behavior = FunctionChoiceBehavior.from_function_call_behavior(legacy_behavior)

    if prepared.function_choice_behavior:
        # Configure the function choice behavior into the settings object
        # that will become part of the request to the AI service.
        prepared.function_choice_behavior.configure(
            kernel=kernel,
            update_settings_callback=update_settings_callback,
            settings=prepared,
        )
    return prepared
Loading