Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python: draft initial implementation of Realtime API #10127

Draft
wants to merge 23 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
346 changes: 346 additions & 0 deletions docs/decisions/00XX-realtime-api-clients.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion python/.cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
"SEMANTICKERNEL",
"OTEL",
"vectorizable",
"desync"
"desync",
"webrtc"
]
}
11 changes: 8 additions & 3 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ dependencies = [
"pybars4 ~= 0.9",
"jinja2 ~= 3.1",
"nest-asyncio ~= 1.6",
"taskgroup >= 0.2.2; python_version < '3.11'",
]

### Optional dependencies
Expand All @@ -61,7 +62,8 @@ chroma = [
]
google = [
"google-cloud-aiplatform ~= 1.60",
"google-generativeai ~= 0.7"
"google-generativeai ~= 0.7",
"google-genai ~= 0.4"
]
hugging_face = [
"transformers[torch] ~= 4.28",
Expand Down Expand Up @@ -123,6 +125,11 @@ dapr = [
"dapr-ext-fastapi>=1.14.0",
"flask-dapr>=1.14.0"
]
openai_realtime = [
"openai[realtime] ~= 1.0",
"aiortc>=1.9.0",
"sounddevice>=0.5.1",
]

[tool.uv]
prerelease = "if-necessary-or-explicit"
Expand Down Expand Up @@ -220,5 +227,3 @@ name = "semantic_kernel"
[build-system]
requires = ["flit-core >= 3.9,<4.0"]
build-backend = "flit_core.buildapi"


Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging

from samples.concepts.audio.utils import check_audio_devices
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.utils import SKAudioPlayer

logging.basicConfig(level=logging.WARNING)
# Quiet the chatty transport libraries; only warnings and above are shown.
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub aiortc

# The characteristics of your speaker and microphone are a big factor in a smooth conversation
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device ids are set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# The call below lists the available audio devices (logged at DEBUG level,
# so raise the log level above if you want to see the output).
check_audio_devices()


async def main() -> None:
    """Run a simple audio chat against the OpenAI Realtime API."""
    # The client can be created with either "websocket" or "webrtc" as the
    # protocol; both behave the same way from the caller's perspective even
    # though the underlying transports are quite different.
    client = OpenAIRealtime("webrtc")
    # Session configuration: the realtime API takes instructions per session
    # instead of a system message.
    session_settings = OpenAIRealtimeExecutionSettings(
        instructions="""
    You are a chat bot. Your name is Mosscap and
    you have one goal: figure out what people need.
    Your full name, should you need to know it, is
    Splendid Speckled Mosscap. You communicate
    effectively, but you tend to answer with long
    flowery prose.
    """,
        voice="alloy",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
    )
    player = SKAudioPlayer()
    print("Mosscap (transcript): ", end="")
    # Entering the context managers creates the realtime session and starts
    # listening to the audio stream.
    async with client, player:
        await client.update_session(settings=session_settings, create_response=True)

        async for event in client.receive():
            kind = event.event_type
            if kind == "audio":
                await player.add_audio(event.audio)
            elif kind == "text":
                print(event.text.text, end="")
            elif kind == "service":
                # OpenAI-specific events.
                if event.service_type == ListenEvents.SESSION_UPDATED:
                    print("Session updated")
                if event.service_type == ListenEvents.RESPONSE_CREATED:
                    print("")
                if event.service_type == ListenEvents.ERROR:
                    logger.error(event.event)


if __name__ == "__main__":
    # Entry point: explain how to interact, then run the async main loop.
    banner = (
        "Instruction: start speaking, when you stop the API should detect you finished and start responding. "
        "Press ctrl + c to stop the program."
    )
    print(banner)
    asyncio.run(main())
144 changes: 144 additions & 0 deletions python/samples/concepts/audio/05-chat_with_realtime_api_complex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
from datetime import datetime
from random import randint

from samples.concepts.audio.utils import check_audio_devices
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.utils import SKAudioPlayer, SKAudioTrack
from semantic_kernel.contents import ChatHistory
from semantic_kernel.functions import kernel_function

logging.basicConfig(level=logging.WARNING)
# Quiet the chatty transport libraries; only warnings and above are shown.
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub aiortc

# The characteristics of your speaker and microphone are a big factor in a smooth conversation
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# Device ids are set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# The call below lists the available audio devices (logged at DEBUG level,
# so raise the log level above if you want to see the output).


check_audio_devices()


@kernel_function
def get_weather(location: str) -> str:
    """Get the weather for a location."""
    # A mock forecast: pick one of the canned conditions at random.
    conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
    index = randint(0, len(conditions) - 1)  # nosec
    weather = conditions[index]
    logger.info(f"@ Getting weather for {location}: {weather}")
    return f"The weather in {location} is {weather}."


@kernel_function
def get_date_time() -> str:
    """Get the current date and time."""
    logger.info("@ Getting current datetime")
    # Format the local wall-clock time as an ISO-8601 string.
    now = datetime.now().isoformat()
    return f"The current date and time is {now}."


@kernel_function
def goodbye():
    """When the user is done, say goodbye and then call this function."""
    logger.info("@ Goodbye has been called!")
    # Deliberately raise KeyboardInterrupt: it unwinds the sample's async
    # context managers exactly as pressing ctrl+c would, ending the session.
    raise KeyboardInterrupt


async def main() -> None:
    """Run an audio chat with function calling against the OpenAI Realtime API."""
    show_transcript = True
    # Build the Kernel and register the helper functions the model may call.
    kernel = Kernel()
    kernel.add_functions(plugin_name="helpers", functions=[goodbye, get_weather, get_date_time])

    # Audio player and track; both take a device_id parameter (the index of
    # the device to use). When None, the default system device is used.
    player = SKAudioPlayer(sample_rate=24000, frame_duration=100, channels=1)
    track = SKAudioTrack()
    # The realtime client can use either "websocket" or "webrtc" as protocol;
    # they behave the same way even though the transports differ greatly.
    client = OpenAIRealtime(
        protocol="websocket",
        audio_output_callback=player.client_callback,
        # audio_track=track,
    )

    # The realtime API does not use a system message; instructions are a
    # session parameter instead.
    instructions = """
    You are a chat bot. Your name is Mosscap and
    you have one goal: figure out what people need.
    Your full name, should you need to know it, is
    Splendid Speckled Mosscap. You communicate
    effectively, but you tend to answer with long
    flowery prose.
    """
    # The key decision is enabling server_vad turn detection. With
    # turn_detection=None you would have to send the
    # "input_audio_buffer.commit" and "response.create" events yourself to
    # end the user's turn and start the response; manual VAD is not part of
    # this sample.
    session_settings = OpenAIRealtimeExecutionSettings(
        instructions=instructions,
        voice="alloy",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
        function_choice_behavior=FunctionChoiceBehavior.Auto(),
    )
    # A chat history can be added to the conversation after it starts.
    history = ChatHistory()
    history.add_user_message("Hi there, who are you?")
    history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

    # The context managers create the session and start the audio streams.
    async with client, player, track.stream_to_realtime_client(client):
        await client.update_session(
            settings=session_settings, chat_history=history, kernel=kernel, create_response=True
        )
        print("Mosscap (transcript): ", end="")
        async for event in client.receive():
            kind = event.event_type
            if kind == "text":
                if show_transcript:
                    print(event.text.text, end="")
            elif kind == "service":
                # OpenAI-specific events.
                if event.service_type == ListenEvents.RESPONSE_CREATED:
                    if show_transcript:
                        print("")
                elif event.service_type == ListenEvents.ERROR:
                    logger.error(event.event)


if __name__ == "__main__":
    # Entry point: explain how to interact, then run the async main loop.
    banner = (
        "Instruction: start speaking, when you stop the API should detect you finished and start responding. "
        "Press ctrl + c to stop the program."
    )
    print(banner)
    asyncio.run(main())
11 changes: 11 additions & 0 deletions python/samples/concepts/audio/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Copyright (c) Microsoft. All rights reserved.

import logging

import sounddevice as sd

logger = logging.getLogger(__name__)


def check_audio_devices():
    """Log and return the audio devices available on this machine.

    Useful for picking a ``device_id`` for the audio player/recorder classes
    used by the samples.

    Note: the list is logged at DEBUG level, while the samples configure
    logging at WARNING — so the log call alone produces no visible output by
    default. The device list is therefore also returned, so callers can print
    or inspect it directly (returning a value is backward-compatible with the
    previous None return).

    Returns:
        The result of ``sounddevice.query_devices()``.
    """
    devices = sd.query_devices()
    logger.debug(devices)
    return devices
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,9 @@ async def get_streaming_chat_message_contents(
for msg in messages:
if msg is not None:
all_messages.append(msg)
if any(isinstance(item, FunctionCallContent) for item in msg.items):
if not function_call_returned and any(
isinstance(item, FunctionCallContent) for item in msg.items
):
function_call_returned = True
yield messages

Expand Down Expand Up @@ -442,7 +444,10 @@ def _get_ai_model_id(self, settings: "PromptExecutionSettings") -> str:
return getattr(settings, "ai_model_id", self.ai_model_id) or self.ai_model_id

def _yield_function_result_messages(self, function_result_messages: list) -> bool:
"""Determine if the function result messages should be yielded."""
"""Determine if the function result messages should be yielded.

If there are messages and if the first message has items, then yield the messages.
"""
return len(function_result_messages) > 0 and len(function_result_messages[0].items) > 0

# endregion
50 changes: 50 additions & 0 deletions python/semantic_kernel/connectors/ai/function_calling_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# Copyright (c) Microsoft. All rights reserved.

from collections import OrderedDict
from collections.abc import Callable
from copy import deepcopy
from typing import TYPE_CHECKING, Any

from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.function_result_content import FunctionResultContent
from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
from semantic_kernel.contents.utils.author_role import AuthorRole
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
from semantic_kernel.utils.experimental_decorator import experimental_function

if TYPE_CHECKING:
from semantic_kernel.connectors.ai.function_choice_behavior import (
Expand All @@ -16,6 +19,7 @@
)
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata
from semantic_kernel.kernel import Kernel


def update_settings_from_function_call_configuration(
Expand Down Expand Up @@ -129,3 +133,49 @@ def merge_streaming_function_results(
function_invoke_attempt=function_invoke_attempt,
)
]


@experimental_function
def prepare_settings_for_function_calling(
    settings: "PromptExecutionSettings",
    settings_class: type["PromptExecutionSettings"],
    update_settings_callback: Callable[..., None],
    kernel: "Kernel",
) -> "PromptExecutionSettings":
    """Prepare settings for the service.

    Args:
        settings: Prompt execution settings.
        settings_class: The settings class.
        update_settings_callback: The callback to update the settings.
        kernel: Kernel instance.

    Returns:
        PromptExecutionSettings of type settings_class.
    """
    # Imported here (not at module level) to avoid circular imports.
    from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior
    from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior

    # Work on a copy so the caller's settings object is never mutated, and
    # convert to the service-specific settings class when needed.
    prepared = deepcopy(settings)
    if not isinstance(prepared, settings_class):
        prepared = settings_class.from_prompt_execution_settings(prepared)

    # Backwards compatibility: translate a legacy `FunctionCallBehavior`
    # carried on the settings into the newer `FunctionChoiceBehavior`.
    legacy_behavior = getattr(prepared, "function_call_behavior", None)
    if isinstance(legacy_behavior, FunctionCallBehavior):
        prepared.function_choice_behavior = FunctionChoiceBehavior.from_function_call_behavior(legacy_behavior)

    if prepared.function_choice_behavior:
        # Configure the function choice behavior into the settings object
        # that will become part of the request to the AI service.
        prepared.function_choice_behavior.configure(
            kernel=kernel,
            update_settings_callback=update_settings_callback,
            settings=prepared,
        )
    return prepared
Loading