diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/README.rst b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/README.rst index 32de3ed255..25e8ec86d8 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/README.rst +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/README.rst @@ -19,7 +19,7 @@ Many LLM platforms support the OpenAI SDK. This means systems such as the follow * - Name - gen_ai.system * - `Azure OpenAI `_ - - ``az.ai.openai`` + - ``azure.ai.openai`` * - `Gemini `_ - ``gemini`` * - `Perplexity `_ @@ -80,7 +80,26 @@ Enabling message content Message content such as the contents of the prompt, completion, function arguments and return values are not captured by default. To capture message content as log events, set the environment variable -`OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT` to `true`. +``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`` to one of the following values: + +- ``true`` - Legacy. Used to enable content capturing on ``gen_ai.{role}.message`` and ``gen_ai.choice`` events when + `latest experimental features <#enabling-the-latest-experimental-features>`_ are *not* enabled. +- ``span`` - Used to enable content capturing on *span* attributes when + `latest experimental features <#enabling-the-latest-experimental-features>`_ are enabled. +- ``event`` - Used to enable content capturing on *event* attributes when + `latest experimental features <#enabling-the-latest-experimental-features>`_ are enabled. + +Enabling the latest experimental features +*********************************************** + +To enable the latest experimental features, set the environment variable +``OTEL_SEMCONV_STABILITY_OPT_IN`` to ``gen_ai_latest_experimental``. Or, if you use +``OTEL_SEMCONV_STABILITY_OPT_IN`` to enable other features, append ``,gen_ai_latest_experimental`` to its value. + +Without this setting, OpenAI instrumentation aligns with `Semantic Conventions v1.28.0 `_ +and would not capture additional details introduced in later versions. + +.. note:: Generative AI semantic conventions are still evolving. The latest experimental features will introduce breaking changes in future releases. Uninstrument ************ diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/manual/.env b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/manual/.env index 1e77ee78c0..16e686f44a 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/manual/.env +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/manual/.env @@ -12,5 +12,19 @@ OPENAI_API_KEY=sk-YOUR_API_KEY OTEL_SERVICE_NAME=opentelemetry-python-openai -# Change to 'false' to hide prompt and completion content -OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=true +# Remove or change to 'none' to hide prompt and completion content +# Possible values (case insensitive): +# - `span` - record content on span attibutes +# - `event` - record content on event attributes +# - `true` - only used for backward compatibility when +# `gen_ai_latest_experimental` is not set in the +# `OTEL_SEMCONV_STABILITY_OPT_IN` environemnt variable. +# - everything else - don't record content on any signal +OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=span + +# Enables latest and greatest features available in GenAI semantic conventions. +# Note: since conventions are still in development, using this flag would +# likely result in having breaking changes. 
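A minimal sketch of the same configuration applied programmatically rather than through a ``.env`` file (assuming manual instrumentation as in this example; values are illustrative)::

    import os

    # Both variables are read when the instrumentor is applied, so set them first.
    os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental"
    os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "span"

    from opentelemetry.instrumentation.openai_v2 import OpenAIInstrumentor

    # Captures prompt and completion content on span attributes under the
    # experimental GenAI semantic conventions.
    OpenAIInstrumentor().instrument()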
+# +# Comment out if you want to use semantic conventions of version 1.36.0. +OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/manual/README.rst b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/manual/README.rst index 61e4c4ae8e..cd380f0fbe 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/manual/README.rst +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/manual/README.rst @@ -11,7 +11,8 @@ your OpenAI requests. Note: `.env <.env>`_ file configures additional environment variables: -- ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=true`` configures OpenAI instrumentation to capture prompt and completion contents on events. +- ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=span`` configures OpenAI instrumentation to capture prompt and completion contents on *span* attributes. +- ``OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`` enables latest experimental features. Setup ----- diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/zero-code/.env b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/zero-code/.env index 8f2dd62b91..489353a1b1 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/zero-code/.env +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/zero-code/.env @@ -18,5 +18,19 @@ OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED=true # Uncomment if your OTLP endpoint doesn't support logs # OTEL_LOGS_EXPORTER=console -# Change to 'false' to hide prompt and completion content -OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=true +# Remove or change to 'none' to hide prompt and completion content +# Possible values (case insensitive): +# - `span` - record content on span attibutes +# - `event` - record content on event attributes +# - `true` - only used for backward compatibility when +# `gen_ai_latest_experimental` is not set in the +# `OTEL_SEMCONV_STABILITY_OPT_IN` environemnt variable. +# - everything else - don't record content on any signal +OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=span + +# Enables latest and greatest features available in GenAI semantic conventions. +# Note: since conventions are still in development, using this flag would +# likely result in having breaking changes. +# +# Comment out if you want to use semantic conventions of version 1.36.0. +OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/zero-code/README.rst b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/zero-code/README.rst index 4332c0b7c0..c9e2cdfd7a 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/zero-code/README.rst +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/examples/zero-code/README.rst @@ -13,8 +13,9 @@ your OpenAI requests. Note: `.env <.env>`_ file configures additional environment variables: - ``OTEL_PYTHON_LOGGING_AUTO_INSTRUMENTATION_ENABLED=true`` configures OpenTelemetry SDK to export logs and events. -- ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=true`` configures OpenAI instrumentation to capture prompt and completion contents on events. 
+- ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=span`` configures OpenAI instrumentation to capture prompt and completion contents on *span* attributes. - ``OTEL_LOGS_EXPORTER=otlp`` to specify exporter type. +- ``OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`` enables latest experimental features. Setup ----- diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/__init__.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/__init__.py index ab4b6f9d7b..2396949a12 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/__init__.py +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/__init__.py @@ -47,7 +47,10 @@ from opentelemetry._events import get_event_logger from opentelemetry.instrumentation.instrumentor import BaseInstrumentor from opentelemetry.instrumentation.openai_v2.package import _instruments -from opentelemetry.instrumentation.openai_v2.utils import is_content_enabled +from opentelemetry.instrumentation.openai_v2.utils import ( + get_content_mode, + is_latest_experimental_enabled, +) from opentelemetry.instrumentation.utils import unwrap from opentelemetry.metrics import get_meter from opentelemetry.semconv.schemas import Schemas @@ -71,13 +74,13 @@ def _instrument(self, **kwargs): __name__, "", tracer_provider, - schema_url=Schemas.V1_28_0.value, + schema_url="https://opentelemetry.io/schemas/1.37.0", # TODO: Schemas.V1_37_0.value, ) event_logger_provider = kwargs.get("event_logger_provider") event_logger = get_event_logger( __name__, "", - schema_url=Schemas.V1_28_0.value, + schema_url="https://opentelemetry.io/schemas/1.37.0", # TODO: Schemas.V1_37_0.value, event_logger_provider=event_logger_provider, ) meter_provider = kwargs.get("meter_provider") @@ -85,16 +88,21 @@ def _instrument(self, **kwargs): __name__, "", meter_provider, - schema_url=Schemas.V1_28_0.value, + schema_url="https://opentelemetry.io/schemas/1.37.0", # TODO: Schemas.V1_37_0.value, ) instruments = Instruments(self._meter) + latest_experimental_enabled = is_latest_experimental_enabled() wrap_function_wrapper( module="openai.resources.chat.completions", name="Completions.create", wrapper=chat_completions_create( - tracer, event_logger, instruments, is_content_enabled() + tracer, + event_logger, + instruments, + get_content_mode(latest_experimental_enabled), + latest_experimental_enabled, ), ) @@ -102,7 +110,11 @@ def _instrument(self, **kwargs): module="openai.resources.chat.completions", name="AsyncCompletions.create", wrapper=async_chat_completions_create( - tracer, event_logger, instruments, is_content_enabled() + tracer, + event_logger, + instruments, + get_content_mode(latest_experimental_enabled), + latest_experimental_enabled, ), ) diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py index 072365abb7..434450d9a0 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/patch.py @@ -13,8 +13,9 @@ # limitations under the License. 
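For context on the hook point used in ``__init__.py`` above, ``wrap_function_wrapper`` installs a wrapt-style wrapper around the OpenAI client methods. A stripped-down sketch of that mechanism (not the actual instrumentation logic) looks like::

    from opentelemetry import trace
    from wrapt import wrap_function_wrapper

    tracer = trace.get_tracer(__name__)

    def _sketch_wrapper(wrapped, instance, args, kwargs):
        # wrapt hands over the original callable, the bound instance and the call arguments.
        with tracer.start_as_current_span(f"chat {kwargs.get('model', 'unknown')}"):
            return wrapped(*args, **kwargs)

    # Same hook point the instrumentation uses for the synchronous client:
    wrap_function_wrapper(
        module="openai.resources.chat.completions",
        name="Completions.create",
        wrapper=_sketch_wrapper,
    )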
+import json from timeit import default_timer -from typing import Optional +from typing import List, Optional from openai import Stream @@ -29,12 +30,18 @@ from .instruments import Instruments from .utils import ( - choice_to_event, + ContentCapturingMode, + DataclassEncoder, + OutputMessage, + TextPart, + ToolCallRequestPart, + create_details_event_attributes, get_llm_request_attributes, - handle_span_exception, is_streaming, - message_to_event, - set_span_attribute, + record_exception, + record_input_messages, + record_output_messages, + set_attribute, ) @@ -42,12 +49,23 @@ def chat_completions_create( tracer: Tracer, event_logger: EventLogger, instruments: Instruments, - capture_content: bool, + content_mode: ContentCapturingMode, + latest_experimental_enabled: bool, ): """Wrap the `create` method of the `ChatCompletion` class to trace it.""" def traced_method(wrapped, instance, args, kwargs): - span_attributes = {**get_llm_request_attributes(kwargs, instance)} + span_attributes = { + **get_llm_request_attributes( + kwargs, + instance, + GenAIAttributes.GenAiOperationNameValues.CHAT.value, + latest_experimental_enabled, + ) + } + details_event_attributes = create_details_event_attributes( + span_attributes, latest_experimental_enabled, content_mode + ) span_name = f"{span_attributes[GenAIAttributes.GEN_AI_OPERATION_NAME]} {span_attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]}" with tracer.start_as_current_span( @@ -56,8 +74,14 @@ def traced_method(wrapped, instance, args, kwargs): attributes=span_attributes, end_on_exit=False, ) as span: - for message in kwargs.get("messages", []): - event_logger.emit(message_to_event(message, capture_content)) + record_input_messages( + kwargs.get("messages", []), + content_mode, + latest_experimental_enabled, + span, + details_event_attributes, + event_logger, + ) start = default_timer() result = None @@ -66,22 +90,48 @@ def traced_method(wrapped, instance, args, kwargs): result = wrapped(*args, **kwargs) if is_streaming(kwargs): return StreamWrapper( - result, span, event_logger, capture_content + result, + span, + details_event_attributes, + event_logger, + content_mode, + latest_experimental_enabled, ) - if span.is_recording(): - _set_response_attributes( - span, result, event_logger, capture_content + _set_response_attributes( + span, + details_event_attributes, + result, + latest_experimental_enabled, + ) + record_output_messages( + getattr(result, "choices", []), + content_mode, + latest_experimental_enabled, + span, + details_event_attributes, + event_logger, + ) + + if details_event_attributes: + event_logger.emit( + Event( + name="gen_ai.client.inference.operation.details", + attributes=details_event_attributes, + trace_id=span.get_span_context().trace_id, + span_id=span.get_span_context().span_id, + trace_flags=span.get_span_context().trace_flags, + ) ) - for choice in getattr(result, "choices", []): - event_logger.emit(choice_to_event(choice, capture_content)) span.end() return result except Exception as error: error_type = type(error).__qualname__ - handle_span_exception(span, error) + record_exception( + span, details_event_attributes, error, event_logger + ) raise finally: duration = max((default_timer() - start), 0) @@ -91,6 +141,7 @@ def traced_method(wrapped, instance, args, kwargs): result, span_attributes, error_type, + latest_experimental_enabled, ) return traced_method @@ -100,12 +151,24 @@ def async_chat_completions_create( tracer: Tracer, event_logger: EventLogger, instruments: Instruments, - capture_content: bool, + 
content_mode: ContentCapturingMode, + latest_experimental_enabled: bool, ): """Wrap the `create` method of the `AsyncChatCompletion` class to trace it.""" async def traced_method(wrapped, instance, args, kwargs): - span_attributes = {**get_llm_request_attributes(kwargs, instance)} + span_attributes = { + **get_llm_request_attributes( + kwargs, + instance, + GenAIAttributes.GenAiOperationNameValues.CHAT.value, + latest_experimental_enabled, + ) + } + + details_event_attributes = create_details_event_attributes( + span_attributes, latest_experimental_enabled, content_mode + ) span_name = f"{span_attributes[GenAIAttributes.GEN_AI_OPERATION_NAME]} {span_attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]}" with tracer.start_as_current_span( @@ -114,8 +177,14 @@ async def traced_method(wrapped, instance, args, kwargs): attributes=span_attributes, end_on_exit=False, ) as span: - for message in kwargs.get("messages", []): - event_logger.emit(message_to_event(message, capture_content)) + record_input_messages( + kwargs.get("messages", []), + content_mode, + latest_experimental_enabled, + span, + details_event_attributes, + event_logger, + ) start = default_timer() result = None @@ -124,22 +193,47 @@ async def traced_method(wrapped, instance, args, kwargs): result = await wrapped(*args, **kwargs) if is_streaming(kwargs): return StreamWrapper( - result, span, event_logger, capture_content + result, + span, + details_event_attributes, + event_logger, + content_mode, + latest_experimental_enabled, ) - if span.is_recording(): - _set_response_attributes( - span, result, event_logger, capture_content - ) - for choice in getattr(result, "choices", []): - event_logger.emit(choice_to_event(choice, capture_content)) + _set_response_attributes( + span, + details_event_attributes, + result, + latest_experimental_enabled, + ) + record_output_messages( + getattr(result, "choices", []), + content_mode, + latest_experimental_enabled, + span, + details_event_attributes, + event_logger, + ) + if details_event_attributes: + event_logger.emit( + Event( + name="gen_ai.client.inference.operation.details", + attributes=details_event_attributes, + trace_id=span.get_span_context().trace_id, + span_id=span.get_span_context().span_id, + trace_flags=span.get_span_context().trace_flags, + ) + ) span.end() return result except Exception as error: error_type = type(error).__qualname__ - handle_span_exception(span, error) + record_exception( + span, details_event_attributes, error, event_logger + ) raise finally: duration = max((default_timer() - start), 0) @@ -149,6 +243,7 @@ async def traced_method(wrapped, instance, args, kwargs): result, span_attributes, error_type, + latest_experimental_enabled, ) return traced_method @@ -160,10 +255,16 @@ def _record_metrics( result, span_attributes: dict, error_type: Optional[str], + latest_experimental_enabled: bool, ): + provider_name_attr_name = ( + "gen_ai.provider.name" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_SYSTEM + ) common_attributes = { GenAIAttributes.GEN_AI_OPERATION_NAME: GenAIAttributes.GenAiOperationNameValues.CHAT.value, - GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value, + provider_name_attr_name: GenAIAttributes.GenAiSystemValues.OPENAI.value, GenAIAttributes.GEN_AI_REQUEST_MODEL: span_attributes[ GenAIAttributes.GEN_AI_REQUEST_MODEL ], @@ -175,13 +276,21 @@ def _record_metrics( if result and getattr(result, "model", None): common_attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL] = result.model + service_tier_attr_key = ( + 
"openai.response.service_tier" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER + ) if result and getattr(result, "service_tier", None): - common_attributes[ - GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER - ] = result.service_tier + common_attributes[service_tier_attr_key] = result.service_tier + system_fingerprint_attr_key = ( + "openai.response.system_fingerprint" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SYSTEM_FINGERPRINT + ) if result and getattr(result, "system_fingerprint", None): - common_attributes["gen_ai.openai.response.system_fingerprint"] = ( + common_attributes[system_fingerprint_attr_key] = ( result.system_fingerprint ) @@ -221,42 +330,63 @@ def _record_metrics( def _set_response_attributes( - span, result, event_logger: EventLogger, capture_content: bool + span, details_event_attributes, result, latest_experimental_enabled: bool ): - set_span_attribute( - span, GenAIAttributes.GEN_AI_RESPONSE_MODEL, result.model + if not span.is_recording() and details_event_attributes is None: + return + + set_attribute( + span, + details_event_attributes, + GenAIAttributes.GEN_AI_RESPONSE_MODEL, + result.model, ) + # finish reasons if getattr(result, "choices", None): finish_reasons = [] for choice in result.choices: finish_reasons.append(choice.finish_reason or "error") - set_span_attribute( + set_attribute( span, + details_event_attributes, GenAIAttributes.GEN_AI_RESPONSE_FINISH_REASONS, finish_reasons, ) if getattr(result, "id", None): - set_span_attribute(span, GenAIAttributes.GEN_AI_RESPONSE_ID, result.id) + set_attribute( + span, + details_event_attributes, + GenAIAttributes.GEN_AI_RESPONSE_ID, + result.id, + ) + service_tier_attr_key = ( + "openai.response.service_tier" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER + ) if getattr(result, "service_tier", None): - set_span_attribute( + set_attribute( span, - GenAIAttributes.GEN_AI_OPENAI_REQUEST_SERVICE_TIER, + details_event_attributes, + service_tier_attr_key, result.service_tier, ) # Get the usage if getattr(result, "usage", None): - set_span_attribute( + set_attribute( span, + details_event_attributes, GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS, result.usage.prompt_tokens, ) - set_span_attribute( + set_attribute( span, + details_event_attributes, GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS, result.usage.completion_tokens, ) @@ -300,6 +430,7 @@ def append_tool_call(self, tool_call): class StreamWrapper: span: Span + details_event_attributes: Optional[dict] response_id: Optional[str] = None response_model: Optional[str] = None service_tier: Optional[str] = None @@ -311,14 +442,18 @@ def __init__( self, stream: Stream, span: Span, + details_event_attributes: dict, event_logger: EventLogger, - capture_content: bool, + content_mode: ContentCapturingMode, + latest_experimental_enabled: bool, ): self.stream = stream self.span = span + self.details_event_attributes = details_event_attributes self.choice_buffers = [] self._span_started = False - self.capture_content = capture_content + self.content_mode = content_mode + self.latest_experimental_enabled = latest_experimental_enabled self.event_logger = event_logger self.setup() @@ -329,90 +464,173 @@ def setup(self): def cleanup(self): if self._span_started: - if self.span.is_recording(): + if self.span.is_recording() or self.details_event_attributes: if self.response_model: - set_span_attribute( + set_attribute( self.span, + self.details_event_attributes, 
GenAIAttributes.GEN_AI_RESPONSE_MODEL, self.response_model, ) if self.response_id: - set_span_attribute( + set_attribute( self.span, + self.details_event_attributes, GenAIAttributes.GEN_AI_RESPONSE_ID, self.response_id, ) - set_span_attribute( + set_attribute( self.span, + self.details_event_attributes, GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS, self.prompt_tokens, ) - set_span_attribute( + set_attribute( self.span, + self.details_event_attributes, GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS, self.completion_tokens, ) - set_span_attribute( + service_tier_attr_key = ( + "openai.response.service_tier" + if self.latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER + ) + set_attribute( self.span, - GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER, + self.details_event_attributes, + service_tier_attr_key, self.service_tier, ) - set_span_attribute( + set_attribute( self.span, + self.details_event_attributes, GenAIAttributes.GEN_AI_RESPONSE_FINISH_REASONS, self.finish_reasons, ) - for idx, choice in enumerate(self.choice_buffers): - message = {"role": "assistant"} - if self.capture_content and choice.text_content: - message["content"] = "".join(choice.text_content) - if choice.tool_calls_buffers: - tool_calls = [] - for tool_call in choice.tool_calls_buffers: - function = {"name": tool_call.function_name} - if self.capture_content: - function["arguments"] = "".join( - tool_call.arguments - ) - tool_call_dict = { - "id": tool_call.tool_call_id, - "type": "function", - "function": function, - } - tool_calls.append(tool_call_dict) - message["tool_calls"] = tool_calls - - body = { - "index": idx, - "finish_reason": choice.finish_reason or "error", - "message": message, - } - - event_attributes = { - GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value - } - - # this span is not current, so we need to manually set the context on event - span_ctx = self.span.get_span_context() - self.event_logger.emit( - Event( - name="gen_ai.choice", - attributes=event_attributes, - body=body, - trace_id=span_ctx.trace_id, - span_id=span_ctx.span_id, - trace_flags=span_ctx.trace_flags, + if self.latest_experimental_enabled: + if ( + self.content_mode == ContentCapturingMode.SPAN + and self.span.is_recording() + ): + output_messages = self._prepare_output_messages() + + self.span.set_attribute( + "gen_ai.output.messages", + json.dumps( + output_messages, + ensure_ascii=False, + cls=DataclassEncoder, + ), + ) + # TODO: once logger.enabled is supported, we should use it to optimize + # and, when enabled, can record event even when content is disabled + if ( + self.content_mode == ContentCapturingMode.EVENT + and self.details_event_attributes is not None + ): + output_messages = self._prepare_output_messages() + self.details_event_attributes["gen_ai.output.messages"] = ( + json.dumps( + output_messages, + ensure_ascii=False, + cls=DataclassEncoder, + ) + ) + + self.event_logger.emit( + Event( + name="gen_ai.client.inference.operation.details", + attributes=self.details_event_attributes, + trace_id=self.span.get_span_context().trace_id, + span_id=self.span.get_span_context().span_id, + trace_flags=self.span.get_span_context().trace_flags, + ) + ) + else: + for idx, choice in enumerate(self.choice_buffers): + message = {"role": "assistant"} + if ( + self.content_mode == ContentCapturingMode.EVENT + and choice.text_content + ): + message["content"] = "".join(choice.text_content) + if choice.tool_calls_buffers: + tool_calls = [] + for tool_call in 
choice.tool_calls_buffers: + function = {"name": tool_call.function_name} + if self.content_mode == ContentCapturingMode.EVENT: + function["arguments"] = "".join( + tool_call.arguments + ) + tool_call_dict = { + "id": tool_call.tool_call_id, + "type": "function", + "function": function, + } + tool_calls.append(tool_call_dict) + message["tool_calls"] = tool_calls + + body = { + "index": idx, + "finish_reason": choice.finish_reason or "error", + "message": message, + } + + event_attributes = { + GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value + } + + # this span is not current, so we need to manually set the context on event + span_ctx = self.span.get_span_context() + self.event_logger.emit( + Event( + name="gen_ai.choice", + attributes=event_attributes, + body=body, + trace_id=span_ctx.trace_id, + span_id=span_ctx.span_id, + trace_flags=span_ctx.trace_flags, + ) ) - ) self.span.end() self._span_started = False + def _prepare_output_messages(self) -> List[OutputMessage]: + output_messages = [] + for choice in self.choice_buffers: + message = OutputMessage( + finish_reason=choice.finish_reason or "error", + role="assistant", + ) + output_messages.append(message) + + if choice.text_content: + message.parts.append( + TextPart(content="".join(choice.text_content)) + ) + if choice.tool_calls_buffers: + for tool_call in choice.tool_calls_buffers: + part = ToolCallRequestPart( + name=tool_call.function_name, + id=tool_call.tool_call_id, + ) + arguments = "".join(tool_call.arguments) + if arguments: + try: + part.arguments = json.loads(arguments) + except json.JSONDecodeError: + part.arguments = arguments + + message.parts.append(part) + return output_messages + def __enter__(self): self.setup() return self @@ -420,7 +638,12 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): try: if exc_type is not None: - handle_span_exception(self.span, exc_val) + record_exception( + self.span, + self.details_event_attributes, + exc_val, + self.event_logger, + ) finally: self.cleanup() return False # Propagate the exception @@ -432,7 +655,12 @@ async def __aenter__(self): async def __aexit__(self, exc_type, exc_val, exc_tb): try: if exc_type is not None: - handle_span_exception(self.span, exc_val) + record_exception( + self.span, + self.details_event_attributes, + exc_val, + self.event_logger, + ) finally: self.cleanup() return False # Propagate the exception @@ -456,7 +684,12 @@ def __next__(self): self.cleanup() raise except Exception as error: - handle_span_exception(self.span, error) + record_exception( + self.span, + self.details_event_attributes, + error, + self.event_logger, + ) self.cleanup() raise @@ -469,7 +702,12 @@ async def __anext__(self): self.cleanup() raise except Exception as error: - handle_span_exception(self.span, error) + record_exception( + self.span, + self.details_event_attributes, + error, + self.event_logger, + ) self.cleanup() raise diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/utils.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/utils.py index f8a837259e..79549b8f51 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/utils.py +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/src/opentelemetry/instrumentation/openai_v2/utils.py @@ -12,14 +12,20 @@ # See the License for the specific language governing permissions and # 
limitations under the License. +import dataclasses +import json +import logging +from collections.abc import Iterable +from dataclasses import dataclass, field +from enum import Enum from os import environ -from typing import Mapping, Optional, Union +from typing import Any, List, Mapping, Optional, Union from urllib.parse import urlparse from httpx import URL from openai import NOT_GIVEN -from opentelemetry._events import Event +from opentelemetry._events import Event, EventLogger from opentelemetry.semconv._incubating.attributes import ( gen_ai_attributes as GenAIAttributes, ) @@ -29,22 +35,84 @@ from opentelemetry.semconv.attributes import ( error_attributes as ErrorAttributes, ) +from opentelemetry.trace import Span from opentelemetry.trace.status import Status, StatusCode OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT = ( "OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT" ) +# TODO: reuse common code +OTEL_SEMCONV_STABILITY_OPT_IN = "OTEL_SEMCONV_STABILITY_OPT_IN" +logger = logging.getLogger(__name__) -def is_content_enabled() -> bool: + +class ContentCapturingMode(str, Enum): + SPAN = "span" + EVENT = "event" + NONE = "none" + + +def get_content_mode( + latest_experimental_enabled: bool, +) -> ContentCapturingMode: capture_content = environ.get( - OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT, "false" + OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT, "none" + ).lower() + + if latest_experimental_enabled: + try: + return ContentCapturingMode(capture_content) + except ValueError as ex: + logger.warning( + "Error when parsing `%s` environment variable: {%s}", + OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT, + str(ex), + ) + return ContentCapturingMode.NONE + + else: + # back-compat + return ( + ContentCapturingMode.EVENT + if capture_content == "true" + else ContentCapturingMode.NONE + ) + + +def is_latest_experimental_enabled() -> bool: + stability_opt_in = environ.get(OTEL_SEMCONV_STABILITY_OPT_IN, None) + + return ( + stability_opt_in is not None + and stability_opt_in.lower() == "gen_ai_latest_experimental" ) - return capture_content.lower() == "true" +def create_details_event_attributes( + request_attributes: dict, + latest_experimental_enabled: bool, + content_mode: ContentCapturingMode, +): + # TODO: once logger.enabled is supported, we should use it to optimize + # and, when enabled, can record event even when content is disabled + # for now, let's only enable event when user enabled content on events. 
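To make the end result concrete: with ``event`` content capture enabled, the attributes accumulated here and emitted on the single ``gen_ai.client.inference.operation.details`` event look roughly like the following (illustrative values only)::

    details_event_attributes = {
        "event.name": "gen_ai.client.inference.operation.details",
        "gen_ai.operation.name": "chat",
        "gen_ai.provider.name": "openai",
        "gen_ai.request.model": "gpt-4o-mini",
        "gen_ai.response.model": "gpt-4o-mini-2024-07-18",
        "gen_ai.response.finish_reasons": ["stop"],
        "gen_ai.usage.input_tokens": 12,
        "gen_ai.usage.output_tokens": 5,
        "gen_ai.input.messages": '[{"role": "user", "parts": [...]}]',
        "gen_ai.output.messages": '[{"role": "assistant", "parts": [...], "finish_reason": "stop"}]',
    }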
+ details_event_attributes = ( + request_attributes.copy() + if latest_experimental_enabled + and content_mode == ContentCapturingMode.EVENT + else None + ) + # TODO: switch to proper event name once possible + if details_event_attributes: + details_event_attributes["event.name"] = ( + "gen_ai.client.inference.operation.details" + ) -def extract_tool_calls(item, capture_content): + return details_event_attributes + + +def extract_tool_calls_old(item, content_mode: ContentCapturingMode): tool_calls = get_property_value(item, "tool_calls") if tool_calls is None: return None @@ -69,7 +137,7 @@ def extract_tool_calls(item, capture_content): tool_call_dict["function"]["name"] = name arguments = get_property_value(func, "arguments") - if capture_content and arguments: + if content_mode == ContentCapturingMode.EVENT and arguments: if isinstance(arguments, str): arguments = arguments.replace("\n", "") tool_call_dict["function"]["arguments"] = arguments @@ -78,6 +146,33 @@ def extract_tool_calls(item, capture_content): return calls +def extract_tool_calls_new(tool_calls) -> list["ToolCallRequestPart"]: + parts = [] + for tool_call in tool_calls: + tool_call_part = ToolCallRequestPart() + call_id = get_property_value(tool_call, "id") + if call_id: + tool_call_part.id = call_id + + func = get_property_value(tool_call, "function") + if func: + tool_call_part.function = {} + name = get_property_value(func, "name") + if name: + tool_call_part.name = name + + arguments = get_property_value(func, "arguments") + if arguments: + try: + tool_call_part.arguments = json.loads(arguments) + except json.JSONDecodeError: + tool_call_part.arguments = arguments + + # TODO: support custom + parts.append(tool_call_part) + return parts + + def set_server_address_and_port(client_instance, attributes): base_client = getattr(client_instance, "_client", None) base_url = getattr(base_client, "base_url", None) @@ -104,7 +199,139 @@ def get_property_value(obj, property_name): return getattr(obj, property_name, None) -def message_to_event(message, capture_content): +def record_input_messages( + messages, + content_mode: ContentCapturingMode, + latest_experimental_enabled: bool, + span: Span, + details_event_attributes: dict, + event_logger: EventLogger, +): + if latest_experimental_enabled: + if ( + content_mode == ContentCapturingMode.NONE + or ( + content_mode == ContentCapturingMode.SPAN + and not span.is_recording() + ) + or ( + content_mode == ContentCapturingMode.EVENT + and details_event_attributes is None + ) + ): + return + + chat_messages = json.dumps( + _prepare_input_messages(messages), + ensure_ascii=False, + cls=DataclassEncoder, + ) + + if span.is_recording() and content_mode == ContentCapturingMode.SPAN: + span.set_attribute("gen_ai.input.messages", chat_messages) + elif ( + details_event_attributes is not None + and content_mode == ContentCapturingMode.EVENT + ): + details_event_attributes["gen_ai.input.messages"] = chat_messages + else: + for message in messages: + event_logger.emit(_message_to_event(message, content_mode)) + + +def _prepare_input_messages(messages) -> List["ChatMessage"]: + chat_messages = [] + for message in messages: + role = get_property_value(message, "role") + chat_message = ChatMessage(role=role, parts=[]) + chat_messages.append(chat_message) + + content = get_property_value(message, "content") + + if role == "assistant": + tool_calls = get_property_value(message, "tool_calls") + if tool_calls: + chat_message.parts += extract_tool_calls_new(tool_calls) + if _is_text_part(content): + 
chat_message.parts.append(TextPart(content=content)) + + elif role == "tool": + tool_call_id = get_property_value(message, "tool_call_id") + chat_message.parts.append( + ToolCallResponsePart(id=tool_call_id, response=content) + ) + + else: + # system, developer, user, fallback + if _is_text_part(content): + chat_message.parts.append(TextPart(content=content)) + return chat_messages + + +def _is_text_part(content: Any) -> bool: + return isinstance(content, str) or ( + isinstance(content, Iterable) + and all(isinstance(part, str) for part in content) + ) + + +def record_output_messages( + choices, + content_mode: ContentCapturingMode, + latest_experimental_enabled: bool, + span: Span, + event_attributes: dict, + event_logger: EventLogger, +): + if latest_experimental_enabled: + if content_mode == ContentCapturingMode.NONE or ( + content_mode == ContentCapturingMode.SPAN + and not span.is_recording() + ): + return + + output_messages = json.dumps( + _prepare_output_messages(choices), + ensure_ascii=False, + cls=DataclassEncoder, + ) + + if content_mode == ContentCapturingMode.SPAN: + span.set_attribute("gen_ai.output.messages", output_messages) + elif ( + content_mode == ContentCapturingMode.EVENT + and event_attributes is not None + ): + event_attributes["gen_ai.output.messages"] = output_messages + else: + for choice in choices: + event_logger.emit(_choice_to_event(choice, content_mode)) + + +def _prepare_output_messages(choices) -> List["OutputMessage"]: + output_messages = [] + for choice in choices: + message = OutputMessage( + finish_reason=choice.finish_reason or "error", + role=( + choice.message.role + if choice.message and choice.message.role + else None + ), + ) + output_messages.append(message) + + if choice.message: + tool_calls = get_property_value(choice.message, "tool_calls") + if tool_calls: + message.parts += extract_tool_calls_new(tool_calls) + content = get_property_value(choice.message, "content") + if _is_text_part(content): + message.parts.append(TextPart(content=content)) + return output_messages + + +def _message_to_event(message, content_mode: ContentCapturingMode): attributes = { GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value } @@ -112,10 +339,10 @@ def message_to_event(message, capture_content): content = get_property_value(message, "content") body = {} - if capture_content and content: + if content_mode == ContentCapturingMode.EVENT and content: body["content"] = content if role == "assistant": - tool_calls = extract_tool_calls(message, capture_content) + tool_calls = extract_tool_calls_old(message, content_mode) if tool_calls: body = {"tool_calls": tool_calls} elif role == "tool": @@ -130,7 +357,7 @@ def message_to_event(message, capture_content): ) -def choice_to_event(choice, capture_content): +def _choice_to_event(choice, content_mode: ContentCapturingMode): attributes = { GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value } @@ -148,11 +375,11 @@ def choice_to_event(choice, capture_content): else None ) } - tool_calls = extract_tool_calls(choice.message, capture_content) + tool_calls = extract_tool_calls_old(choice.message, content_mode) if tool_calls: message["tool_calls"] = tool_calls content = get_property_value(choice.message, "content") - if capture_content and content: + if content_mode == ContentCapturingMode.EVENT and content: message["content"] = content body["message"] = message @@ -163,16 +390,16 @@ def choice_to_event(choice, capture_content): ) -def set_span_attributes(span, attributes: 
dict): - for field, value in attributes.model_dump(by_alias=True).items(): - set_span_attribute(span, field, value) - +def set_attribute(span, event_attributes, name, value): + if not span.is_recording() and event_attributes is None: + return -def set_span_attribute(span, name, value): if non_numerical_value_is_set(value) is False: return span.set_attribute(name, value) + if event_attributes is not None: + event_attributes[name] = value def is_streaming(kwargs): @@ -184,13 +411,21 @@ def non_numerical_value_is_set(value: Optional[Union[bool, str]]): def get_llm_request_attributes( - kwargs, - client_instance, - operation_name=GenAIAttributes.GenAiOperationNameValues.CHAT.value, + kwargs, client_instance, operation_name, latest_experimental_enabled ): + provider_name_attr_key = ( + "gen_ai.provider.name" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_SYSTEM + ) + request_seed_attr_key = ( + "gen_ai.request.seed" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_SEED + ) attributes = { GenAIAttributes.GEN_AI_OPERATION_NAME: operation_name, - GenAIAttributes.GEN_AI_SYSTEM: GenAIAttributes.GenAiSystemValues.OPENAI.value, + provider_name_attr_key: GenAIAttributes.GenAiSystemValues.OPENAI.value, GenAIAttributes.GEN_AI_REQUEST_MODEL: kwargs.get("model"), GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE: kwargs.get("temperature"), GenAIAttributes.GEN_AI_REQUEST_TOP_P: kwargs.get("p") @@ -202,26 +437,51 @@ def get_llm_request_attributes( GenAIAttributes.GEN_AI_REQUEST_FREQUENCY_PENALTY: kwargs.get( "frequency_penalty" ), - GenAIAttributes.GEN_AI_OPENAI_REQUEST_SEED: kwargs.get("seed"), + request_seed_attr_key: kwargs.get("seed"), } + output_type_attr_key = ( + "gen_ai.output.type" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_RESPONSE_FORMAT + ) if (response_format := kwargs.get("response_format")) is not None: # response_format may be string or object with a string in the `type` key if isinstance(response_format, Mapping): if ( response_format_type := response_format.get("type") ) is not None: - attributes[ - GenAIAttributes.GEN_AI_OPENAI_REQUEST_RESPONSE_FORMAT - ] = response_format_type + if response_format_type == "text": + attributes[output_type_attr_key] = ( + "text" # TODO there should be an enum in semconv package + ) + elif ( + response_format_type == "json_schema" + or response_format_type == "json_object" + ): + attributes[output_type_attr_key] = "json" + else: + # should never happen with chat completion API + pass else: - attributes[ - GenAIAttributes.GEN_AI_OPENAI_REQUEST_RESPONSE_FORMAT - ] = response_format + # should never happen with chat completion API + attributes[output_type_attr_key] = response_format set_server_address_and_port(client_instance, attributes) - service_tier = kwargs.get("service_tier") - attributes[GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER] = ( + + service_tier_attribute_key = ( + "openai.request.service_tier" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_SERVICE_TIER + ) + + extra_body = kwargs.get("extra_body", None) + if extra_body and isinstance(extra_body, dict): + service_tier = extra_body.get("service_tier", None) + else: + service_tier = kwargs.get("service_tier", None) + + attributes[service_tier_attribute_key] = ( service_tier if service_tier != "auto" else None ) @@ -229,10 +489,91 @@ def get_llm_request_attributes( return {k: v for k, v in attributes.items() if v is not None} -def handle_span_exception(span, error): - 
span.set_status(Status(StatusCode.ERROR, str(error))) +def record_exception(span, details_event_attributes, error, event_logger): if span.is_recording(): + span.set_status(Status(StatusCode.ERROR, str(error))) span.set_attribute( ErrorAttributes.ERROR_TYPE, type(error).__qualname__ ) + if details_event_attributes: + details_event_attributes[ErrorAttributes.ERROR_TYPE] = type( + error + ).__qualname__ + event_logger.emit( + Event( + name="gen_ai.client.inference.operation.details", + attributes=details_event_attributes, + trace_id=span.get_span_context().trace_id, + span_id=span.get_span_context().span_id, + trace_flags=span.get_span_context().trace_flags, + ) + ) span.end() + + +@dataclass +class TextPart: + type: str = "text" + content: str = None + + +@dataclass +class ToolCallRequestPart: + type: str = "tool_call" + id: Optional[str] = None + name: str = "" + arguments: Any = None + + +@dataclass +class ToolCallResponsePart: + type: str = "tool_call_response" + id: Optional[str] = None + response: Any = None + + +@dataclass +class GenericPart: + type: str = "" + + +MessagePart = Union[ + TextPart, + ToolCallRequestPart, + ToolCallResponsePart, + GenericPart, +] + + +class Role(str, Enum): + SYSTEM = "system" + USER = "user" + ASSISTANT = "assistant" + TOOL = "tool" + + +@dataclass +class ChatMessage: + role: Union[Role, str] + parts: List[MessagePart] = field(default_factory=list) + + +class FinishReason(str, Enum): + STOP = "stop" + LENGTH = "length" + CONTENT_FILTER = "content_filter" + TOOL_CALL = "tool_call" + ERROR = "error" + + +@dataclass +class OutputMessage(ChatMessage): + finish_reason: Union[FinishReason, str] = "" + + +class DataclassEncoder(json.JSONEncoder): + def default(self, obj): + if dataclasses.is_dataclass(obj): + return dataclasses.asdict(obj) + else: + return super(DataclassEncoder, self).default(obj) diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/conftest.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/conftest.py index 87505046aa..ee67ca3b34 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/conftest.py +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/conftest.py @@ -10,6 +10,7 @@ from opentelemetry.instrumentation.openai_v2 import OpenAIInstrumentor from opentelemetry.instrumentation.openai_v2.utils import ( OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT, + OTEL_SEMCONV_STABILITY_OPT_IN, ) from opentelemetry.sdk._events import EventLoggerProvider from opentelemetry.sdk._logs import LoggerProvider @@ -104,12 +105,33 @@ def vcr_config(): } +@pytest.fixture(scope="function", params=[True, False]) +def latest_experimental_enabled(request): + return request.param + + +@pytest.fixture(scope="function", params=["span", "event"]) +def content_mode(request, latest_experimental_enabled): + return request.param if latest_experimental_enabled else "True" + + @pytest.fixture(scope="function") def instrument_no_content( - tracer_provider, event_logger_provider, meter_provider + tracer_provider, + event_logger_provider, + meter_provider, + latest_experimental_enabled, ): os.environ.update( - {OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "False"} + {OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "none"} + ) + + os.environ.update( + { + OTEL_SEMCONV_STABILITY_OPT_IN: "gen_ai_latest_experimental" + if latest_experimental_enabled + else "" + } ) instrumentor = OpenAIInstrumentor() @@ -121,15 +143,28 @@ def instrument_no_content( yield instrumentor 
os.environ.pop(OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT, None) + os.environ.pop(OTEL_SEMCONV_STABILITY_OPT_IN, None) instrumentor.uninstrument() @pytest.fixture(scope="function") def instrument_with_content( - tracer_provider, event_logger_provider, meter_provider + tracer_provider, + event_logger_provider, + meter_provider, + latest_experimental_enabled, + content_mode, ): os.environ.update( - {OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "True"} + {OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: content_mode} + ) + + os.environ.update( + { + OTEL_SEMCONV_STABILITY_OPT_IN: "gen_ai_latest_experimental" + if latest_experimental_enabled + else "" + } ) instrumentor = OpenAIInstrumentor() instrumentor.instrument( @@ -140,15 +175,28 @@ def instrument_with_content( yield instrumentor os.environ.pop(OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT, None) + os.environ.pop(OTEL_SEMCONV_STABILITY_OPT_IN, None) instrumentor.uninstrument() @pytest.fixture(scope="function") def instrument_with_content_unsampled( - span_exporter, event_logger_provider, meter_provider + span_exporter, + event_logger_provider, + meter_provider, + latest_experimental_enabled, + content_mode, ): os.environ.update( - {OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: "True"} + {OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: content_mode} + ) + + os.environ.update( + { + OTEL_SEMCONV_STABILITY_OPT_IN: "gen_ai_latest_experimental" + if latest_experimental_enabled + else "" + } ) tracer_provider = TracerProvider(sampler=ALWAYS_OFF) @@ -163,6 +211,7 @@ def instrument_with_content_unsampled( yield instrumentor os.environ.pop(OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT, None) + os.environ.pop(OTEL_SEMCONV_STABILITY_OPT_IN, None) instrumentor.uninstrument() diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_async_chat_completions.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_async_chat_completions.py index 468caa232c..ed1f9d7d58 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_async_chat_completions.py +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_async_chat_completions.py @@ -13,13 +13,13 @@ # limitations under the License. 
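The content assertions in the tests below compare against the serialized message form; a small sketch of how the dataclasses added in ``utils.py`` render into the ``gen_ai.input.messages`` attribute::

    import json

    from opentelemetry.instrumentation.openai_v2.utils import (
        ChatMessage,
        DataclassEncoder,
        TextPart,
    )

    messages = [
        ChatMessage(role="user", parts=[TextPart(content="Say this is a test")])
    ]
    # DataclassEncoder falls back to dataclasses.asdict() for dataclass instances.
    print(json.dumps(messages, ensure_ascii=False, cls=DataclassEncoder))
    # [{"role": "user", "parts": [{"type": "text", "content": "Say this is a test"}]}]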
# pylint: disable=too-many-locals -from typing import Optional import pytest from openai import APIConnectionError, AsyncOpenAI, NotFoundError -from openai.resources.chat.completions import ChatCompletion -from opentelemetry.sdk.trace import ReadableSpan +from opentelemetry.instrumentation.openai_v2.utils import ( + is_latest_experimental_enabled, +) from opentelemetry.semconv._incubating.attributes import ( error_attributes as ErrorAttributes, ) @@ -32,235 +32,411 @@ from opentelemetry.semconv._incubating.attributes import ( server_attributes as ServerAttributes, ) +from tests.test_utils import ( + DEFAULT_MODEL, + USER_ONLY_EXPECTED_INPUT_MESSAGES, + USER_ONLY_PROMPT, + WEATHER_TOOL_EXPECTED_INPUT_MESSAGES, + WEATHER_TOOL_PROMPT, + assert_all_attributes, + assert_completion_attributes, + assert_log_parent, + assert_messages_attribute, + format_simple_expected_output_message, + get_current_weather_tool_definition, + remove_none_values, +) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_with_content( - span_exporter, log_exporter, async_openai_client, instrument_with_content + span_exporter, + log_exporter, + async_openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - response = await async_openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, stream=False - ) - - spans = span_exporter.get_finished_spans() - assert_completion_attributes(spans[0], llm_model_value, response) + with vcr.use_cassette("test_async_chat_completion_with_content.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 + response = await async_openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False + ) - user_message = {"content": messages_value[0]["content"]} - assert_message_in_logs( - logs[0], "gen_ai.user.message", user_message, spans[0] - ) + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + assert_completion_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) - choice_event = { - "index": 0, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": response.choices[0].message.content, - }, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0]) + if latest_experimental_enabled: + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + format_simple_expected_output_message( + response.choices[0].message.content + ), + ) + else: + assert len(logs) == 2 + + user_message = {"content": USER_ONLY_PROMPT[0]["content"]} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response.choices[0].message.content, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, spans[0] + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_no_content( - span_exporter, log_exporter, 
async_openai_client, instrument_no_content + span_exporter, + log_exporter, + async_openai_client, + instrument_no_content, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with vcr.use_cassette("test_async_chat_completion_no_content.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() - response = await async_openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, stream=False - ) + response = await async_openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False + ) - spans = span_exporter.get_finished_spans() - assert_completion_attributes(spans[0], llm_model_value, response) + spans = span_exporter.get_finished_spans() + assert_completion_attributes( + spans[0], + None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 + logs = log_exporter.get_finished_logs() + if latest_experimental_enabled: + assert len(logs) == 0 + assert "gen_ai.input.messages" not in spans[0].attributes + assert "gen_ai.output.messages" not in spans[0].attributes + else: + assert len(logs) == 2 - assert_message_in_logs(logs[0], "gen_ai.user.message", None, spans[0]) + assert_message_in_logs( + logs[0], "gen_ai.user.message", None, spans[0] + ) - choice_event = { - "index": 0, - "finish_reason": "stop", - "message": {"role": "assistant"}, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0]) + choice_event = { + "index": 0, + "finish_reason": "stop", + "message": {"role": "assistant"}, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, spans[0] + ) @pytest.mark.asyncio() async def test_async_chat_completion_bad_endpoint( - span_exporter, instrument_no_content + span_exporter, + instrument_no_content, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - client = AsyncOpenAI(base_url="http://localhost:4242") - - with pytest.raises(APIConnectionError): - await client.chat.completions.create( - messages=messages_value, - model=llm_model_value, - timeout=0.1, + with vcr.use_cassette("test_async_chat_completion_bad_endpoint.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() + + client = AsyncOpenAI(base_url="http://localhost:4242") + + with pytest.raises(APIConnectionError): + await client.chat.completions.create( + messages=USER_ONLY_PROMPT, + model=DEFAULT_MODEL, + timeout=0.1, + ) + + spans = span_exporter.get_finished_spans() + assert_all_attributes( + spans[0], + None, + DEFAULT_MODEL, + latest_experimental_enabled, + server_address="localhost", + ) + assert 4242 == spans[0].attributes[ServerAttributes.SERVER_PORT] + assert ( + "APIConnectionError" + == spans[0].attributes[ErrorAttributes.ERROR_TYPE] ) - - spans = span_exporter.get_finished_spans() - assert_all_attributes( - spans[0], llm_model_value, server_address="localhost" - ) - assert 4242 == spans[0].attributes[ServerAttributes.SERVER_PORT] - assert ( - "APIConnectionError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE] - ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_404( - span_exporter, async_openai_client, instrument_no_content + span_exporter, + async_openai_client, + instrument_no_content, + vcr, ): - llm_model_value = "this-model-does-not-exist" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with 
vcr.use_cassette("test_async_chat_completion_404.yaml"): + llm_model_value = "this-model-does-not-exist" - with pytest.raises(NotFoundError): - await async_openai_client.chat.completions.create( - messages=messages_value, - model=llm_model_value, - ) + with pytest.raises(NotFoundError): + await async_openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, + model=llm_model_value, + ) - spans = span_exporter.get_finished_spans() + spans = span_exporter.get_finished_spans() - assert_all_attributes(spans[0], llm_model_value) - assert "NotFoundError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE] + assert_all_attributes( + spans[0], None, llm_model_value, is_latest_experimental_enabled() + ) + assert ( + "NotFoundError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE] + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_extra_params( - span_exporter, async_openai_client, instrument_no_content + span_exporter, + log_exporter, + async_openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with vcr.use_cassette("test_async_chat_completion_extra_params.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() + + response = await async_openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, + model=DEFAULT_MODEL, + seed=42, + temperature=0.5, + max_tokens=50, + stream=False, + extra_body={"service_tier": "default"}, + response_format={"type": "text"}, + ) - response = await async_openai_client.chat.completions.create( - messages=messages_value, - model=llm_model_value, - seed=42, - temperature=0.5, - max_tokens=50, - stream=False, - extra_body={"service_tier": "default"}, - response_format={"type": "text"}, - ) + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + + assert_completion_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) - spans = span_exporter.get_finished_spans() - assert_completion_attributes(spans[0], llm_model_value, response) - assert ( - spans[0].attributes[GenAIAttributes.GEN_AI_OPENAI_REQUEST_SEED] == 42 - ) - assert ( - spans[0].attributes[GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE] == 0.5 - ) - assert spans[0].attributes[GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS] == 50 - assert ( - spans[0].attributes[GenAIAttributes.GEN_AI_OPENAI_REQUEST_SERVICE_TIER] - == "default" - ) - assert ( - spans[0].attributes[ - GenAIAttributes.GEN_AI_OPENAI_REQUEST_RESPONSE_FORMAT - ] - == "text" - ) + request_seed_attr_key = ( + "gen_ai.request.seed" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_SEED + ) + assert spans[0].attributes[request_seed_attr_key] == 42 + assert ( + spans[0].attributes[GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE] + == 0.5 + ) + assert ( + spans[0].attributes[GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS] + == 50 + ) + + service_tier_attr_key = ( + "openai.request.service_tier" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_SERVICE_TIER + ) + assert spans[0].attributes[service_tier_attr_key] == "default" + + output_type_attr_key = ( + "gen_ai.output.type" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_RESPONSE_FORMAT + ) + assert spans[0].attributes[output_type_attr_key] == "text" @pytest.mark.vcr() 
@pytest.mark.asyncio() async def test_async_chat_completion_multiple_choices( - span_exporter, log_exporter, async_openai_client, instrument_with_content + span_exporter, + log_exporter, + async_openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - response = await async_openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, n=2, stream=False - ) + with vcr.use_cassette("test_async_chat_completion_multiple_choices.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() - spans = span_exporter.get_finished_spans() - assert_completion_attributes(spans[0], llm_model_value, response) - - logs = log_exporter.get_finished_logs() - assert len(logs) == 3 # 1 user message + 2 choice messages - - user_message = {"content": messages_value[0]["content"]} - assert_message_in_logs( - logs[0], "gen_ai.user.message", user_message, spans[0] - ) + response = await async_openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, n=2, stream=False + ) - choice_event_0 = { - "index": 0, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": response.choices[0].message.content, - }, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event_0, spans[0]) + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + assert_completion_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) - choice_event_1 = { - "index": 1, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": response.choices[1].message.content, - }, - } - assert_message_in_logs(logs[2], "gen_ai.choice", choice_event_1, spans[0]) + if latest_experimental_enabled: + expected_output_messages = [ + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": response.choices[0].message.content, + } + ], + "finish_reason": "stop", + }, + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": response.choices[1].message.content, + } + ], + "finish_reason": "stop", + }, + ] + + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + expected_output_messages, + ) + else: + assert len(logs) == 3 # 1 user message + 2 choice messages + + user_message = {"content": USER_ONLY_PROMPT[0]["content"]} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event_0 = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response.choices[0].message.content, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event_0, spans[0] + ) + + choice_event_1 = { + "index": 1, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response.choices[1].message.content, + }, + } + assert_message_in_logs( + logs[2], "gen_ai.choice", choice_event_1, spans[0] + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_tool_calls_with_content( - span_exporter, log_exporter, async_openai_client, instrument_with_content + span_exporter, + log_exporter, + async_openai_client, + instrument_with_content, + content_mode, + vcr, ): - await chat_completion_tool_call( - span_exporter, 
log_exporter, async_openai_client, True - ) + with vcr.use_cassette( + "test_async_chat_completion_tool_calls_with_content.yaml" + ): + await chat_completion_tool_call( + span_exporter, + log_exporter, + async_openai_client, + True, + content_mode, + is_latest_experimental_enabled(), + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_tool_calls_no_content( - span_exporter, log_exporter, async_openai_client, instrument_no_content + span_exporter, + log_exporter, + async_openai_client, + instrument_no_content, + vcr, ): - await chat_completion_tool_call( - span_exporter, log_exporter, async_openai_client, False - ) + with vcr.use_cassette( + "test_async_chat_completion_tool_calls_no_content.yaml" + ): + await chat_completion_tool_call( + span_exporter, + log_exporter, + async_openai_client, + False, + None, + is_latest_experimental_enabled(), + ) async def chat_completion_tool_call( - span_exporter, log_exporter, async_openai_client, expect_content + span_exporter, + log_exporter, + async_openai_client, + expect_content, + content_mode, + latest_experimental_enabled, ): - llm_model_value = "gpt-4o-mini" - messages_value = [ - {"role": "system", "content": "You're a helpful assistant."}, - { - "role": "user", - "content": "What's the weather in Seattle and San Francisco today?", - }, - ] + messages_value = WEATHER_TOOL_PROMPT.copy() response_0 = await async_openai_client.chat.completions.create( messages=messages_value, - model=llm_model_value, + model=DEFAULT_MODEL, tool_choice="auto", tools=[get_current_weather_tool_definition()], ) @@ -293,7 +469,7 @@ async def chat_completion_tool_call( messages_value.append(tool_call_result_1) response_1 = await async_openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value + messages=messages_value, model=DEFAULT_MODEL ) # sanity check @@ -302,335 +478,600 @@ async def chat_completion_tool_call( # validate both calls spans = span_exporter.get_finished_spans() assert len(spans) == 2 - assert_completion_attributes(spans[0], llm_model_value, response_0) - assert_completion_attributes(spans[1], llm_model_value, response_1) logs = log_exporter.get_finished_logs() - assert len(logs) == 9 # 3 logs for first completion, 6 for second + if content_mode == "event": + assert len(logs) == 2 - # call one - system_message = ( - {"content": messages_value[0]["content"]} if expect_content else None - ) - assert_message_in_logs( - logs[0], "gen_ai.system.message", system_message, spans[0] - ) - - user_message = ( - {"content": messages_value[1]["content"]} if expect_content else None + assert_completion_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response_0, + latest_experimental_enabled, ) - assert_message_in_logs( - logs[1], "gen_ai.user.message", user_message, spans[0] + assert_completion_attributes( + spans[1], + logs[1].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response_1, + latest_experimental_enabled, ) - function_call_0 = {"name": "get_current_weather"} - function_call_1 = {"name": "get_current_weather"} - if expect_content: - function_call_0["arguments"] = ( - response_0.choices[0] - .message.tool_calls[0] - .function.arguments.replace("\n", "") - ) - function_call_1["arguments"] = ( - response_0.choices[0] - .message.tool_calls[1] - .function.arguments.replace("\n", "") - ) - - choice_event = { - "index": 0, - "finish_reason": "tool_calls", - "message": { - "role": "assistant", - "tool_calls": [ + if 
latest_experimental_enabled: + if not expect_content: + pass + else: + # first call + signal_0 = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal_0.attributes["gen_ai.input.messages"], + WEATHER_TOOL_EXPECTED_INPUT_MESSAGES, + ) + + first_output = [ + { + "role": "assistant", + "parts": [ + { + "type": "tool_call", + "id": response_0.choices[0] + .message.tool_calls[0] + .id, + "name": "get_current_weather", + "arguments": {"location": "Seattle, WA"}, + }, + { + "type": "tool_call", + "id": response_0.choices[0] + .message.tool_calls[1] + .id, + "name": "get_current_weather", + "arguments": {"location": "San Francisco, CA"}, + }, + ], + "finish_reason": "tool_calls", + } + ] + assert_messages_attribute( + signal_0.attributes["gen_ai.output.messages"], first_output + ) + + # second call + del first_output[0]["finish_reason"] + second_input = [] + second_input += WEATHER_TOOL_EXPECTED_INPUT_MESSAGES.copy() + second_input += first_output + second_input += [ { - "id": response_0.choices[0].message.tool_calls[0].id, - "type": "function", - "function": function_call_0, + "role": "tool", + "parts": [ + { + "type": "tool_call_response", + "id": response_0.choices[0] + .message.tool_calls[0] + .id, + "response": tool_call_result_0["content"], + } + ], }, { - "id": response_0.choices[0].message.tool_calls[1].id, - "type": "function", - "function": function_call_1, + "role": "tool", + "parts": [ + { + "type": "tool_call_response", + "id": response_0.choices[0] + .message.tool_calls[1] + .id, + "response": tool_call_result_1["content"], + } + ], }, - ], - }, - } - assert_message_in_logs(logs[2], "gen_ai.choice", choice_event, spans[0]) + ] + + signal_1 = ( + logs[1].log_record if content_mode == "event" else spans[1] + ) + assert_messages_attribute( + signal_1.attributes["gen_ai.input.messages"], second_input + ) + + assert_messages_attribute( + signal_1.attributes["gen_ai.output.messages"], + [ + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": response_1.choices[ + 0 + ].message.content, + }, + ], + "finish_reason": "stop", + } + ], + ) + else: + assert len(logs) == 9 # 3 logs for first completion, 6 for second - # call two - system_message = ( - {"content": messages_value[0]["content"]} if expect_content else None - ) - assert_message_in_logs( - logs[3], "gen_ai.system.message", system_message, spans[1] - ) + # call one + system_message = ( + {"content": messages_value[0]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[0], "gen_ai.system.message", system_message, spans[0] + ) - user_message = ( - {"content": messages_value[1]["content"]} if expect_content else None - ) - assert_message_in_logs( - logs[4], "gen_ai.user.message", user_message, spans[1] - ) + user_message = ( + {"content": messages_value[1]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[1], "gen_ai.user.message", user_message, spans[0] + ) - assistant_tool_call = {"tool_calls": messages_value[2]["tool_calls"]} - if not expect_content: - assistant_tool_call["tool_calls"][0]["function"]["arguments"] = None - assistant_tool_call["tool_calls"][1]["function"]["arguments"] = None + function_call_0 = {"name": "get_current_weather"} + function_call_1 = {"name": "get_current_weather"} + if expect_content: + function_call_0["arguments"] = ( + response_0.choices[0] + .message.tool_calls[0] + .function.arguments.replace("\n", "") + ) + function_call_1["arguments"] = ( + response_0.choices[0] + 
.message.tool_calls[1] + .function.arguments.replace("\n", "") + ) + + choice_event = { + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "tool_calls": [ + { + "id": response_0.choices[0].message.tool_calls[0].id, + "type": "function", + "function": function_call_0, + }, + { + "id": response_0.choices[0].message.tool_calls[1].id, + "type": "function", + "function": function_call_1, + }, + ], + }, + } + assert_message_in_logs( + logs[2], "gen_ai.choice", choice_event, spans[0] + ) - assert_message_in_logs( - logs[5], "gen_ai.assistant.message", assistant_tool_call, spans[1] - ) + # call two + system_message = ( + {"content": messages_value[0]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[3], "gen_ai.system.message", system_message, spans[1] + ) - tool_message_0 = { - "id": tool_call_result_0["tool_call_id"], - "content": tool_call_result_0["content"] if expect_content else None, - } + user_message = ( + {"content": messages_value[1]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[4], "gen_ai.user.message", user_message, spans[1] + ) - assert_message_in_logs( - logs[6], "gen_ai.tool.message", tool_message_0, spans[1] - ) + assistant_tool_call = {"tool_calls": messages_value[2]["tool_calls"]} + if not expect_content: + assistant_tool_call["tool_calls"][0]["function"]["arguments"] = ( + None + ) + assistant_tool_call["tool_calls"][1]["function"]["arguments"] = ( + None + ) + + assert_message_in_logs( + logs[5], "gen_ai.assistant.message", assistant_tool_call, spans[1] + ) - tool_message_1 = { - "id": tool_call_result_1["tool_call_id"], - "content": tool_call_result_1["content"] if expect_content else None, - } + tool_message_0 = { + "id": tool_call_result_0["tool_call_id"], + "content": tool_call_result_0["content"] + if expect_content + else None, + } - assert_message_in_logs( - logs[7], "gen_ai.tool.message", tool_message_1, spans[1] - ) + assert_message_in_logs( + logs[6], "gen_ai.tool.message", tool_message_0, spans[1] + ) - message = { - "role": "assistant", - "content": response_1.choices[0].message.content - if expect_content - else None, - } - choice = { - "index": 0, - "finish_reason": "stop", - "message": message, - } - assert_message_in_logs(logs[8], "gen_ai.choice", choice, spans[1]) + tool_message_1 = { + "id": tool_call_result_1["tool_call_id"], + "content": tool_call_result_1["content"] + if expect_content + else None, + } + + assert_message_in_logs( + logs[7], "gen_ai.tool.message", tool_message_1, spans[1] + ) + + message = { + "role": "assistant", + "content": response_1.choices[0].message.content + if expect_content + else None, + } + choice = { + "index": 0, + "finish_reason": "stop", + "message": message, + } + assert_message_in_logs(logs[8], "gen_ai.choice", choice, spans[1]) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_streaming( - span_exporter, log_exporter, async_openai_client, instrument_with_content + span_exporter, + log_exporter, + async_openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - kwargs = { - "model": llm_model_value, - "messages": messages_value, - "stream": True, - "stream_options": {"include_usage": True}, - } - - response_stream_usage = None - response_stream_model = None - response_stream_id = None - response_stream_result = "" - response = await 
async_openai_client.chat.completions.create(**kwargs) - async for chunk in response: - if chunk.choices: - response_stream_result += chunk.choices[0].delta.content or "" - - # get the last chunk - if getattr(chunk, "usage", None): - response_stream_usage = chunk.usage - response_stream_model = chunk.model - response_stream_id = chunk.id - - spans = span_exporter.get_finished_spans() - assert_all_attributes( - spans[0], - llm_model_value, - response_stream_id, - response_stream_model, - response_stream_usage.prompt_tokens, - response_stream_usage.completion_tokens, - ) - - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 - - user_message = {"content": "Say this is a test"} - assert_message_in_logs( - logs[0], "gen_ai.user.message", user_message, spans[0] - ) + with vcr.use_cassette("test_async_chat_completion_streaming.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() + llm_model_value = "gpt-4" + + kwargs = { + "model": llm_model_value, + "messages": USER_ONLY_PROMPT, + "stream": True, + "stream_options": {"include_usage": True}, + } - choice_event = { - "index": 0, - "finish_reason": "stop", - "message": {"role": "assistant", "content": response_stream_result}, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0]) + response_stream_usage = None + response_stream_model = None + response_stream_id = None + response_stream_result = "" + response = await async_openai_client.chat.completions.create(**kwargs) + async for chunk in response: + if chunk.choices: + response_stream_result += chunk.choices[0].delta.content or "" + + # get the last chunk + if getattr(chunk, "usage", None): + response_stream_usage = chunk.usage + response_stream_model = chunk.model + response_stream_id = chunk.id + + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + + assert_all_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + llm_model_value, + latest_experimental_enabled, + response_stream_id, + response_stream_model, + response_stream_usage.prompt_tokens, + response_stream_usage.completion_tokens, + ) + if latest_experimental_enabled: + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + format_simple_expected_output_message(response_stream_result), + ) + else: + assert len(logs) == 2 + + user_message = {"content": "Say this is a test"} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response_stream_result, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, spans[0] + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_streaming_not_complete( - span_exporter, log_exporter, async_openai_client, instrument_with_content + span_exporter, + log_exporter, + async_openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - kwargs = { - "model": llm_model_value, - "messages": messages_value, - "stream": True, - } - - response_stream_model = None - response_stream_id = None - response_stream_result = "" - 
response = await async_openai_client.chat.completions.create(**kwargs) - idx = 0 - async for chunk in response: - if chunk.choices: - response_stream_result += chunk.choices[0].delta.content or "" - if idx == 1: - # fake a stop - break - - if chunk.model: - response_stream_model = chunk.model - if chunk.id: - response_stream_id = chunk.id - idx += 1 - - response.close() - spans = span_exporter.get_finished_spans() - assert_all_attributes( - spans[0], llm_model_value, response_stream_id, response_stream_model - ) - - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 - - user_message = {"content": "Say this is a test"} - assert_message_in_logs( - logs[0], "gen_ai.user.message", user_message, spans[0] - ) + with vcr.use_cassette( + "test_async_chat_completion_streaming_not_complete.yaml" + ): + latest_experimental_enabled = is_latest_experimental_enabled() + llm_model_value = "gpt-4" + + kwargs = { + "model": llm_model_value, + "messages": USER_ONLY_PROMPT, + "stream": True, + } - choice_event = { - "index": 0, - "finish_reason": "error", - "message": {"role": "assistant", "content": response_stream_result}, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0]) + response_stream_model = None + response_stream_id = None + response_stream_result = "" + response = await async_openai_client.chat.completions.create(**kwargs) + idx = 0 + async for chunk in response: + if chunk.choices: + response_stream_result += chunk.choices[0].delta.content or "" + if idx == 1: + # fake a stop + break + + if chunk.model: + response_stream_model = chunk.model + if chunk.id: + response_stream_id = chunk.id + idx += 1 + + response.close() + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + assert_all_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + llm_model_value, + latest_experimental_enabled, + response_stream_id, + response_stream_model, + ) + if latest_experimental_enabled: + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + format_simple_expected_output_message( + response_stream_result, finish_reason="error" + ), + ) + else: + assert len(logs) == 2 + + user_message = {"content": "Say this is a test"} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event = { + "index": 0, + "finish_reason": "error", + "message": { + "role": "assistant", + "content": response_stream_result, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, spans[0] + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_multiple_choices_streaming( - span_exporter, log_exporter, async_openai_client, instrument_with_content + span_exporter, + log_exporter, + async_openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [ - {"role": "system", "content": "You're a helpful assistant."}, - { - "role": "user", - "content": "What's the weather in Seattle and San Francisco today?", - }, - ] - - response_0 = await async_openai_client.chat.completions.create( - messages=messages_value, - model=llm_model_value, - n=2, - stream=True, - stream_options={"include_usage": True}, - ) - - # two strings for each 
choice - response_stream_result = ["", ""] - finish_reasons = ["", ""] - async for chunk in response_0: - if chunk.choices: - for choice in chunk.choices: - response_stream_result[choice.index] += ( - choice.delta.content or "" - ) - if choice.finish_reason: - finish_reasons[choice.index] = choice.finish_reason - - # get the last chunk - if getattr(chunk, "usage", None): - response_stream_usage = chunk.usage - response_stream_model = chunk.model - response_stream_id = chunk.id - - # sanity check - assert "stop" == finish_reasons[0] - - spans = span_exporter.get_finished_spans() - assert_all_attributes( - spans[0], - llm_model_value, - response_stream_id, - response_stream_model, - response_stream_usage.prompt_tokens, - response_stream_usage.completion_tokens, - ) - - logs = log_exporter.get_finished_logs() - assert len(logs) == 4 - - system_message = {"content": messages_value[0]["content"]} - assert_message_in_logs( - logs[0], "gen_ai.system.message", system_message, spans[0] - ) - - user_message = { - "content": "What's the weather in Seattle and San Francisco today?" - } - assert_message_in_logs( - logs[1], "gen_ai.user.message", user_message, spans[0] - ) + with vcr.use_cassette( + "test_async_chat_completion_multiple_choices_streaming.yaml" + ): + latest_experimental_enabled = is_latest_experimental_enabled() + + response_0 = await async_openai_client.chat.completions.create( + messages=WEATHER_TOOL_PROMPT, + model=DEFAULT_MODEL, + n=2, + stream=True, + stream_options={"include_usage": True}, + ) - choice_event_0 = { - "index": 0, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": "".join(response_stream_result[0]), - }, - } - assert_message_in_logs(logs[2], "gen_ai.choice", choice_event_0, spans[0]) + # two strings for each choice + response_stream_result = ["", ""] + finish_reasons = ["", ""] + async for chunk in response_0: + if chunk.choices: + for choice in chunk.choices: + response_stream_result[choice.index] += ( + choice.delta.content or "" + ) + if choice.finish_reason: + finish_reasons[choice.index] = choice.finish_reason + + # get the last chunk + if getattr(chunk, "usage", None): + response_stream_usage = chunk.usage + response_stream_model = chunk.model + response_stream_id = chunk.id + + # sanity check + assert "stop" == finish_reasons[0] + + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + + if content_mode == "event": + assert 1 == len(logs) + + assert_all_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + latest_experimental_enabled, + response_stream_id, + response_stream_model, + response_stream_usage.prompt_tokens, + response_stream_usage.completion_tokens, + ) - choice_event_1 = { - "index": 1, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": "".join(response_stream_result[1]), - }, - } - assert_message_in_logs(logs[3], "gen_ai.choice", choice_event_1, spans[0]) + if latest_experimental_enabled: + expected_output_messages = [ + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": "".join(response_stream_result[0]), + } + ], + "finish_reason": "stop", + }, + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": "".join(response_stream_result[1]), + } + ], + "finish_reason": "stop", + }, + ] + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + WEATHER_TOOL_EXPECTED_INPUT_MESSAGES, + ) 
+ assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + expected_output_messages, + ) + else: + assert len(logs) == 4 + + system_message = {"content": WEATHER_TOOL_PROMPT[0]["content"]} + assert_message_in_logs( + logs[0], "gen_ai.system.message", system_message, spans[0] + ) + + user_message = { + "content": "What's the weather in Seattle and San Francisco today?" + } + assert_message_in_logs( + logs[1], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event_0 = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": "".join(response_stream_result[0]), + }, + } + assert_message_in_logs( + logs[2], "gen_ai.choice", choice_event_0, spans[0] + ) + + choice_event_1 = { + "index": 1, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": "".join(response_stream_result[1]), + }, + } + assert_message_in_logs( + logs[3], "gen_ai.choice", choice_event_1, spans[0] + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_multiple_tools_streaming_with_content( - span_exporter, log_exporter, async_openai_client, instrument_with_content + span_exporter, + log_exporter, + async_openai_client, + instrument_with_content, + content_mode, + vcr, ): - await async_chat_completion_multiple_tools_streaming( - span_exporter, log_exporter, async_openai_client, True - ) + with vcr.use_cassette( + "test_async_chat_completion_multiple_tools_streaming_with_content.yaml" + ): + await async_chat_completion_multiple_tools_streaming( + span_exporter, + log_exporter, + async_openai_client, + True, + content_mode, + is_latest_experimental_enabled(), + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_multiple_tools_streaming_no_content( - span_exporter, log_exporter, async_openai_client, instrument_no_content + span_exporter, + log_exporter, + async_openai_client, + instrument_no_content, + vcr, ): - await async_chat_completion_multiple_tools_streaming( - span_exporter, log_exporter, async_openai_client, False - ) + with vcr.use_cassette( + "test_async_chat_completion_multiple_tools_streaming_no_content.yaml" + ): + await async_chat_completion_multiple_tools_streaming( + span_exporter, + log_exporter, + async_openai_client, + False, + None, + is_latest_experimental_enabled(), + ) @pytest.mark.vcr() @@ -640,63 +1081,108 @@ async def test_async_chat_completion_streaming_unsampled( log_exporter, async_openai_client, instrument_with_content_unsampled, + content_mode, + vcr, ): - llm_model_value = "gpt-4" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - kwargs = { - "model": llm_model_value, - "messages": messages_value, - "stream": True, - "stream_options": {"include_usage": True}, - } - - response_stream_result = "" - response = await async_openai_client.chat.completions.create(**kwargs) - async for chunk in response: - if chunk.choices: - response_stream_result += chunk.choices[0].delta.content or "" - - spans = span_exporter.get_finished_spans() - assert len(spans) == 0 - - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 + with vcr.use_cassette( + "test_async_chat_completion_streaming_unsampled.yaml" + ): + latest_experimental_enabled = is_latest_experimental_enabled() + llm_model_value = "gpt-4" + + kwargs = { + "model": llm_model_value, + "messages": USER_ONLY_PROMPT, + "stream": True, + "stream_options": {"include_usage": True}, + } - user_message = {"content": "Say this is a test"} - assert_message_in_logs(logs[0], 
"gen_ai.user.message", user_message, None) + response_stream_result = "" + response_stream_id = None + response_stream_usage = None + response = await async_openai_client.chat.completions.create(**kwargs) + async for chunk in response: + if chunk.choices: + response_stream_result += chunk.choices[0].delta.content or "" + # get the last chunk + if getattr(chunk, "usage", None): + response_stream_usage = chunk.usage + response_stream_model = chunk.model + response_stream_id = chunk.id + + spans = span_exporter.get_finished_spans() + assert len(spans) == 0 + + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + + assert_all_attributes( + None, + logs[0].log_record if content_mode == "event" else None, + llm_model_value, + latest_experimental_enabled, + response_stream_id, + response_stream_model, + response_stream_usage.prompt_tokens, + response_stream_usage.completion_tokens, + ) - choice_event = { - "index": 0, - "finish_reason": "stop", - "message": {"role": "assistant", "content": response_stream_result}, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, None) + if latest_experimental_enabled: + if content_mode == "event": + assert_messages_attribute( + logs[0].log_record.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + logs[0].log_record.attributes["gen_ai.output.messages"], + format_simple_expected_output_message( + "".join(response_stream_result) + ), + ) + else: + assert len(logs) == 2 + + user_message = {"content": "Say this is a test"} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, None + ) + + choice_event = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response_stream_result, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, None + ) - assert logs[0].log_record.trace_id is not None - assert logs[0].log_record.span_id is not None - assert logs[0].log_record.trace_flags == 0 + assert logs[0].log_record.trace_id is not None + assert logs[0].log_record.span_id is not None + assert logs[0].log_record.trace_flags == 0 - assert logs[0].log_record.trace_id == logs[1].log_record.trace_id - assert logs[0].log_record.span_id == logs[1].log_record.span_id - assert logs[0].log_record.trace_flags == logs[1].log_record.trace_flags + assert logs[0].log_record.trace_id == logs[1].log_record.trace_id + assert logs[0].log_record.span_id == logs[1].log_record.span_id + assert ( + logs[0].log_record.trace_flags + == logs[1].log_record.trace_flags + ) async def async_chat_completion_multiple_tools_streaming( - span_exporter, log_exporter, async_openai_client, expect_content + span_exporter, + log_exporter, + async_openai_client, + expect_content, + content_mode, + latest_experimental_enabled, ): - llm_model_value = "gpt-4o-mini" - messages_value = [ - {"role": "system", "content": "You're a helpful assistant."}, - { - "role": "user", - "content": "What's the weather in Seattle and San Francisco today?", - }, - ] - response = await async_openai_client.chat.completions.create( - messages=messages_value, - model=llm_model_value, + messages=WEATHER_TOOL_PROMPT, + model=DEFAULT_MODEL, tool_choice="auto", tools=[get_current_weather_tool_definition()], stream=True, @@ -732,68 +1218,114 @@ async def async_chat_completion_multiple_tools_streaming( assert "tool_calls" == finish_reason spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == 
"event": + assert len(logs) == 1 + assert_all_attributes( spans[0], - llm_model_value, + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + latest_experimental_enabled, response_stream_id, response_stream_model, response_stream_usage.prompt_tokens, response_stream_usage.completion_tokens, ) - logs = log_exporter.get_finished_logs() - assert len(logs) == 3 + if latest_experimental_enabled: + if expect_content: + # first call + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + WEATHER_TOOL_EXPECTED_INPUT_MESSAGES, + ) + + first_output = [ + { + "role": "assistant", + "parts": [ + { + "type": "tool_call", + "id": tool_call_ids[0], + "name": "get_current_weather", + "arguments": {"location": "Seattle, WA"}, + }, + { + "type": "tool_call", + "id": tool_call_ids[1], + "name": "get_current_weather", + "arguments": {"location": "San Francisco, CA"}, + }, + ], + "finish_reason": "tool_calls", + } + ] + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], first_output + ) + else: + assert len(logs) == 3 - system_message = ( - {"content": messages_value[0]["content"]} if expect_content else None - ) - assert_message_in_logs( - logs[0], "gen_ai.system.message", system_message, spans[0] - ) + system_message = ( + {"content": WEATHER_TOOL_PROMPT[0]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[0], "gen_ai.system.message", system_message, spans[0] + ) - user_message = ( - {"content": "What's the weather in Seattle and San Francisco today?"} - if expect_content - else None - ) - assert_message_in_logs( - logs[1], "gen_ai.user.message", user_message, spans[0] - ) + user_message = ( + { + "content": "What's the weather in Seattle and San Francisco today?" 
+ } + if expect_content + else None + ) + assert_message_in_logs( + logs[1], "gen_ai.user.message", user_message, spans[0] + ) - choice_event = { - "index": 0, - "finish_reason": "tool_calls", - "message": { - "role": "assistant", - "tool_calls": [ - { - "id": tool_call_ids[0], - "type": "function", - "function": { - "name": tool_names[0], - "arguments": ( - tool_args[0].replace("\n", "") - if expect_content - else None - ), + choice_event = { + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "tool_calls": [ + { + "id": tool_call_ids[0], + "type": "function", + "function": { + "name": tool_names[0], + "arguments": ( + tool_args[0].replace("\n", "") + if expect_content + else None + ), + }, }, - }, - { - "id": tool_call_ids[1], - "type": "function", - "function": { - "name": tool_names[1], - "arguments": ( - tool_args[1].replace("\n", "") - if expect_content - else None - ), + { + "id": tool_call_ids[1], + "type": "function", + "function": { + "name": tool_names[1], + "arguments": ( + tool_args[1].replace("\n", "") + if expect_content + else None + ), + }, }, - }, - ], - }, - } - assert_message_in_logs(logs[2], "gen_ai.choice", choice_event, spans[0]) + ], + }, + } + assert_message_in_logs( + logs[2], "gen_ai.choice", choice_event, spans[0] + ) def assert_message_in_logs(log, event_name, expected_content, parent_span): @@ -811,124 +1343,3 @@ def assert_message_in_logs(log, event_name, expected_content, parent_span): expected_content ) assert_log_parent(log, parent_span) - - -def remove_none_values(body): - result = {} - for key, value in body.items(): - if value is None: - continue - if isinstance(value, dict): - result[key] = remove_none_values(value) - elif isinstance(value, list): - result[key] = [remove_none_values(i) for i in value] - else: - result[key] = value - return result - - -def assert_completion_attributes( - span: ReadableSpan, - request_model: str, - response: ChatCompletion, - operation_name: str = "chat", - server_address: str = "api.openai.com", -): - return assert_all_attributes( - span, - request_model, - response.id, - response.model, - response.usage.prompt_tokens, - response.usage.completion_tokens, - operation_name, - server_address, - ) - - -def assert_all_attributes( - span: ReadableSpan, - request_model: str, - response_id: str = None, - response_model: str = None, - input_tokens: Optional[int] = None, - output_tokens: Optional[int] = None, - operation_name: str = "chat", - server_address: str = "api.openai.com", -): - assert span.name == f"{operation_name} {request_model}" - assert ( - operation_name - == span.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME] - ) - assert ( - GenAIAttributes.GenAiSystemValues.OPENAI.value - == span.attributes[GenAIAttributes.GEN_AI_SYSTEM] - ) - assert ( - request_model == span.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL] - ) - if response_model: - assert ( - response_model - == span.attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL] - ) - else: - assert GenAIAttributes.GEN_AI_RESPONSE_MODEL not in span.attributes - - if response_id: - assert ( - response_id == span.attributes[GenAIAttributes.GEN_AI_RESPONSE_ID] - ) - else: - assert GenAIAttributes.GEN_AI_RESPONSE_ID not in span.attributes - - if input_tokens: - assert ( - input_tokens - == span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] - ) - else: - assert GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS not in span.attributes - - if output_tokens: - assert ( - output_tokens - == 
span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] - ) - else: - assert ( - GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS not in span.attributes - ) - - assert server_address == span.attributes[ServerAttributes.SERVER_ADDRESS] - - -def assert_log_parent(log, span): - if span: - assert log.log_record.trace_id == span.get_span_context().trace_id - assert log.log_record.span_id == span.get_span_context().span_id - assert ( - log.log_record.trace_flags == span.get_span_context().trace_flags - ) - - -def get_current_weather_tool_definition(): - return { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. Boston, MA", - }, - }, - "required": ["location"], - "additionalProperties": False, - }, - }, - } diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_completions.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_completions.py index 914d5b5b98..f0dbcc4bc9 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_completions.py +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_completions.py @@ -13,13 +13,13 @@ # limitations under the License. # pylint: disable=too-many-locals -from typing import Optional import pytest from openai import APIConnectionError, NotFoundError, OpenAI -from openai.resources.chat.completions import ChatCompletion -from opentelemetry.sdk.trace import ReadableSpan +from opentelemetry.instrumentation.openai_v2.utils import ( + is_latest_experimental_enabled, +) from opentelemetry.semconv._incubating.attributes import ( error_attributes as ErrorAttributes, ) @@ -33,267 +33,440 @@ server_attributes as ServerAttributes, ) from opentelemetry.semconv._incubating.metrics import gen_ai_metrics +from tests.test_utils import ( + DEFAULT_MODEL, + USER_ONLY_EXPECTED_INPUT_MESSAGES, + USER_ONLY_PROMPT, + WEATHER_TOOL_EXPECTED_INPUT_MESSAGES, + WEATHER_TOOL_PROMPT, + assert_all_attributes, + assert_completion_attributes, + assert_log_parent, + assert_messages_attribute, + format_simple_expected_output_message, + get_current_weather_tool_definition, +) @pytest.mark.vcr() def test_chat_completion_with_content( - span_exporter, log_exporter, openai_client, instrument_with_content + span_exporter, + log_exporter, + openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - response = openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, stream=False - ) - - spans = span_exporter.get_finished_spans() - assert_completion_attributes(spans[0], llm_model_value, response) + with vcr.use_cassette("test_chat_completion_with_content.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 + response = openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False + ) - user_message = {"content": messages_value[0]["content"]} - assert_message_in_logs( - logs[0], "gen_ai.user.message", user_message, spans[0] - ) + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + + 
assert_completion_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) - choice_event = { - "index": 0, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": response.choices[0].message.content, - }, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0]) + if latest_experimental_enabled: + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + format_simple_expected_output_message( + response.choices[0].message.content + ), + ) + else: + assert len(logs) == 2 + + user_message = {"content": USER_ONLY_PROMPT[0]["content"]} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response.choices[0].message.content, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, spans[0] + ) @pytest.mark.vcr() def test_chat_completion_no_content( - span_exporter, log_exporter, openai_client, instrument_no_content + span_exporter, + log_exporter, + openai_client, + instrument_no_content, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with vcr.use_cassette("test_chat_completion_no_content.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() - response = openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, stream=False - ) + response = openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False + ) - spans = span_exporter.get_finished_spans() - assert_completion_attributes(spans[0], llm_model_value, response) + spans = span_exporter.get_finished_spans() + assert_completion_attributes( + spans[0], + None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 + logs = log_exporter.get_finished_logs() + if latest_experimental_enabled: + assert len(logs) == 0 + assert "gen_ai.input.messages" not in spans[0].attributes + assert "gen_ai.output.messages" not in spans[0].attributes + else: + assert len(logs) == 2 - assert_message_in_logs(logs[0], "gen_ai.user.message", None, spans[0]) + assert_message_in_logs( + logs[0], "gen_ai.user.message", None, spans[0] + ) - choice_event = { - "index": 0, - "finish_reason": "stop", - "message": {"role": "assistant"}, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0]) + choice_event = { + "index": 0, + "finish_reason": "stop", + "message": {"role": "assistant"}, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, spans[0] + ) def test_chat_completion_bad_endpoint( - span_exporter, metric_reader, instrument_no_content + span_exporter, + metric_reader, + instrument_no_content, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - client = OpenAI(base_url="http://localhost:4242") - - with pytest.raises(APIConnectionError): - client.chat.completions.create( - messages=messages_value, - model=llm_model_value, - timeout=0.1, + with vcr.use_cassette("test_chat_completion_bad_endpoint.yaml"): + 
latest_experimental_enabled = is_latest_experimental_enabled() + + client = OpenAI(base_url="http://localhost:4242") + + with pytest.raises(APIConnectionError): + client.chat.completions.create( + messages=USER_ONLY_PROMPT, + model=DEFAULT_MODEL, + timeout=0.1, + ) + + spans = span_exporter.get_finished_spans() + assert_all_attributes( + spans[0], + None, + DEFAULT_MODEL, + latest_experimental_enabled, + server_address="localhost", + ) + assert 4242 == spans[0].attributes[ServerAttributes.SERVER_PORT] + assert ( + "APIConnectionError" + == spans[0].attributes[ErrorAttributes.ERROR_TYPE] ) - spans = span_exporter.get_finished_spans() - assert_all_attributes( - spans[0], llm_model_value, server_address="localhost" - ) - assert 4242 == spans[0].attributes[ServerAttributes.SERVER_PORT] - assert ( - "APIConnectionError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE] - ) - - metrics = metric_reader.get_metrics_data().resource_metrics - assert len(metrics) == 1 - - metric_data = metrics[0].scope_metrics[0].metrics - duration_metric = next( - ( - m - for m in metric_data - if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION - ), - None, - ) - assert duration_metric is not None - assert duration_metric.data.data_points[0].sum > 0 - assert ( - duration_metric.data.data_points[0].attributes[ - ErrorAttributes.ERROR_TYPE - ] - == "APIConnectionError" - ) + metrics = metric_reader.get_metrics_data().resource_metrics + assert len(metrics) == 1 + + metric_data = metrics[0].scope_metrics[0].metrics + duration_metric = next( + ( + m + for m in metric_data + if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION + ), + None, + ) + assert duration_metric is not None + assert duration_metric.data.data_points[0].sum > 0 + assert ( + duration_metric.data.data_points[0].attributes[ + ErrorAttributes.ERROR_TYPE + ] + == "APIConnectionError" + ) @pytest.mark.vcr() def test_chat_completion_404( - span_exporter, openai_client, metric_reader, instrument_no_content + span_exporter, + openai_client, + metric_reader, + instrument_no_content, + vcr, ): - llm_model_value = "this-model-does-not-exist" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - with pytest.raises(NotFoundError): - openai_client.chat.completions.create( - messages=messages_value, - model=llm_model_value, - ) + with vcr.use_cassette("test_chat_completion_404.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() + llm_model_value = "this-model-does-not-exist" - spans = span_exporter.get_finished_spans() + with pytest.raises(NotFoundError): + openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, + model=llm_model_value, + ) - assert_all_attributes(spans[0], llm_model_value) - assert "NotFoundError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE] + spans = span_exporter.get_finished_spans() - metrics = metric_reader.get_metrics_data().resource_metrics - assert len(metrics) == 1 + assert_all_attributes( + spans[0], None, llm_model_value, latest_experimental_enabled + ) + assert ( + "NotFoundError" == spans[0].attributes[ErrorAttributes.ERROR_TYPE] + ) - metric_data = metrics[0].scope_metrics[0].metrics - duration_metric = next( - ( - m - for m in metric_data - if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION - ), - None, - ) - assert duration_metric is not None - assert duration_metric.data.data_points[0].sum > 0 - assert ( - duration_metric.data.data_points[0].attributes[ - ErrorAttributes.ERROR_TYPE - ] - == "NotFoundError" - ) + metrics = 
metric_reader.get_metrics_data().resource_metrics + assert len(metrics) == 1 + + metric_data = metrics[0].scope_metrics[0].metrics + duration_metric = next( + ( + m + for m in metric_data + if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION + ), + None, + ) + assert duration_metric is not None + assert duration_metric.data.data_points[0].sum > 0 + assert ( + duration_metric.data.data_points[0].attributes[ + ErrorAttributes.ERROR_TYPE + ] + == "NotFoundError" + ) @pytest.mark.vcr() def test_chat_completion_extra_params( - span_exporter, openai_client, instrument_no_content + span_exporter, + openai_client, + instrument_no_content, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with vcr.use_cassette("test_chat_completion_extra_params.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() + + response = openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, + model=DEFAULT_MODEL, + seed=42, + temperature=0.5, + max_tokens=50, + stream=False, + extra_body={"service_tier": "default"}, + response_format={"type": "text"}, + ) - response = openai_client.chat.completions.create( - messages=messages_value, - model=llm_model_value, - seed=42, - temperature=0.5, - max_tokens=50, - stream=False, - extra_body={"service_tier": "default"}, - response_format={"type": "text"}, - ) + spans = span_exporter.get_finished_spans() + assert_completion_attributes( + spans[0], + None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) - spans = span_exporter.get_finished_spans() - assert_completion_attributes(spans[0], llm_model_value, response) - assert ( - spans[0].attributes[GenAIAttributes.GEN_AI_OPENAI_REQUEST_SEED] == 42 - ) - assert ( - spans[0].attributes[GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE] == 0.5 - ) - assert spans[0].attributes[GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS] == 50 - assert ( - spans[0].attributes[GenAIAttributes.GEN_AI_OPENAI_REQUEST_SERVICE_TIER] - == "default" - ) - assert ( - spans[0].attributes[ - GenAIAttributes.GEN_AI_OPENAI_REQUEST_RESPONSE_FORMAT - ] - == "text" - ) + request_seed_attr_key = ( + "gen_ai.request.seed" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_SEED + ) + assert spans[0].attributes[request_seed_attr_key] == 42 + assert ( + spans[0].attributes[GenAIAttributes.GEN_AI_REQUEST_TEMPERATURE] + == 0.5 + ) + assert ( + spans[0].attributes[GenAIAttributes.GEN_AI_REQUEST_MAX_TOKENS] + == 50 + ) + + service_tier_attr_key = ( + "openai.request.service_tier" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_SERVICE_TIER + ) + assert spans[0].attributes[service_tier_attr_key] == "default" + + output_type_attr_key = ( + "gen_ai.output.type" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_REQUEST_RESPONSE_FORMAT + ) + assert spans[0].attributes[output_type_attr_key] == "text" @pytest.mark.vcr() def test_chat_completion_multiple_choices( - span_exporter, log_exporter, openai_client, instrument_with_content + span_exporter, + log_exporter, + openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with vcr.use_cassette("test_chat_completion_multiple_choices.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() - response = openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, n=2, stream=False - ) - - spans 
= span_exporter.get_finished_spans() - assert_completion_attributes(spans[0], llm_model_value, response) - - logs = log_exporter.get_finished_logs() - assert len(logs) == 3 # 1 user message + 2 choice messages - - user_message = {"content": messages_value[0]["content"]} - assert_message_in_logs( - logs[0], "gen_ai.user.message", user_message, spans[0] - ) - - choice_event_0 = { - "index": 0, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": response.choices[0].message.content, - }, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event_0, spans[0]) + response = openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, n=2, stream=False + ) - choice_event_1 = { - "index": 1, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": response.choices[1].message.content, - }, - } - assert_message_in_logs(logs[2], "gen_ai.choice", choice_event_1, spans[0]) + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + assert_completion_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) + if latest_experimental_enabled: + expected_output_messages = [ + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": response.choices[0].message.content, + } + ], + "finish_reason": "stop", + }, + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": response.choices[1].message.content, + } + ], + "finish_reason": "stop", + }, + ] + + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + expected_output_messages, + ) + else: + assert len(logs) == 3 # 1 user message + 2 choice messages + + user_message = {"content": USER_ONLY_PROMPT[0]["content"]} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event_0 = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response.choices[0].message.content, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event_0, spans[0] + ) + + choice_event_1 = { + "index": 1, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response.choices[1].message.content, + }, + } + assert_message_in_logs( + logs[2], "gen_ai.choice", choice_event_1, spans[0] + ) @pytest.mark.vcr() def test_chat_completion_tool_calls_with_content( - span_exporter, log_exporter, openai_client, instrument_with_content + span_exporter, + log_exporter, + openai_client, + instrument_with_content, + content_mode, + vcr, ): - chat_completion_tool_call(span_exporter, log_exporter, openai_client, True) + with vcr.use_cassette("test_chat_completion_tool_calls_with_content.yaml"): + chat_completion_tool_call( + span_exporter, + log_exporter, + openai_client, + True, + content_mode, + is_latest_experimental_enabled(), + ) @pytest.mark.vcr() def test_chat_completion_tool_calls_no_content( - span_exporter, log_exporter, openai_client, instrument_no_content + span_exporter, + log_exporter, + openai_client, + instrument_no_content, + vcr, ): - chat_completion_tool_call( - span_exporter, log_exporter, openai_client, False - ) + with 
vcr.use_cassette("test_chat_completion_tool_calls_no_content.yaml"): + chat_completion_tool_call( + span_exporter, + log_exporter, + openai_client, + False, + None, + is_latest_experimental_enabled(), + ) def chat_completion_tool_call( - span_exporter, log_exporter, openai_client, expect_content + span_exporter, + log_exporter, + openai_client, + expect_content, + content_mode, + latest_experimental_enabled, ): - llm_model_value = "gpt-4o-mini" - messages_value = [ - {"role": "system", "content": "You're a helpful assistant."}, - { - "role": "user", - "content": "What's the weather in Seattle and San Francisco today?", - }, - ] - + messages_value = WEATHER_TOOL_PROMPT.copy() response_0 = openai_client.chat.completions.create( messages=messages_value, - model=llm_model_value, + model=DEFAULT_MODEL, tool_choice="auto", tools=[get_current_weather_tool_definition()], ) @@ -326,7 +499,7 @@ def chat_completion_tool_call( messages_value.append(tool_call_result_1) response_1 = openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value + messages=messages_value, model=DEFAULT_MODEL ) # sanity check @@ -335,328 +508,577 @@ def chat_completion_tool_call( # validate both calls spans = span_exporter.get_finished_spans() assert len(spans) == 2 - assert_completion_attributes(spans[0], llm_model_value, response_0) - assert_completion_attributes(spans[1], llm_model_value, response_1) logs = log_exporter.get_finished_logs() - assert len(logs) == 9 # 3 logs for first completion, 6 for second - - # call one - system_message = ( - {"content": messages_value[0]["content"]} if expect_content else None - ) - assert_message_in_logs( - logs[0], "gen_ai.system.message", system_message, spans[0] - ) + if content_mode == "event": + assert len(logs) == 2 - user_message = ( - {"content": messages_value[1]["content"]} if expect_content else None + assert_completion_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response_0, + latest_experimental_enabled, ) - assert_message_in_logs( - logs[1], "gen_ai.user.message", user_message, spans[0] + assert_completion_attributes( + spans[1], + logs[1].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response_1, + latest_experimental_enabled, ) - function_call_0 = {"name": "get_current_weather"} - function_call_1 = {"name": "get_current_weather"} - if expect_content: - function_call_0["arguments"] = ( - response_0.choices[0] - .message.tool_calls[0] - .function.arguments.replace("\n", "") - ) - function_call_1["arguments"] = ( - response_0.choices[0] - .message.tool_calls[1] - .function.arguments.replace("\n", "") - ) - - choice_event = { - "index": 0, - "finish_reason": "tool_calls", - "message": { - "role": "assistant", - "tool_calls": [ + if latest_experimental_enabled: + if not expect_content: + pass + else: + # first call + signal_0 = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal_0.attributes["gen_ai.input.messages"], + WEATHER_TOOL_EXPECTED_INPUT_MESSAGES, + ) + + first_output = [ { - "id": response_0.choices[0].message.tool_calls[0].id, - "type": "function", - "function": function_call_0, + "role": "assistant", + "parts": [ + { + "type": "tool_call", + "id": response_0.choices[0] + .message.tool_calls[0] + .id, + "name": "get_current_weather", + "arguments": {"location": "Seattle, WA"}, + }, + { + "type": "tool_call", + "id": response_0.choices[0] + .message.tool_calls[1] + .id, + "name": "get_current_weather", + 
"arguments": {"location": "San Francisco, CA"}, + }, + ], + "finish_reason": "tool_calls", + } + ] + + assert_messages_attribute( + signal_0.attributes["gen_ai.output.messages"], first_output + ) + + # second call + del first_output[0]["finish_reason"] + second_input = [] + second_input += WEATHER_TOOL_EXPECTED_INPUT_MESSAGES.copy() + second_input += first_output + second_input += [ + { + "role": "tool", + "parts": [ + { + "type": "tool_call_response", + "id": response_0.choices[0] + .message.tool_calls[0] + .id, + "response": tool_call_result_0["content"], + } + ], }, { - "id": response_0.choices[0].message.tool_calls[1].id, - "type": "function", - "function": function_call_1, + "role": "tool", + "parts": [ + { + "type": "tool_call_response", + "id": response_0.choices[0] + .message.tool_calls[1] + .id, + "response": tool_call_result_1["content"], + } + ], }, - ], - }, - } - assert_message_in_logs(logs[2], "gen_ai.choice", choice_event, spans[0]) + ] + + signal_1 = ( + logs[1].log_record if content_mode == "event" else spans[1] + ) + assert_messages_attribute( + signal_1.attributes["gen_ai.input.messages"], second_input + ) + + assert_messages_attribute( + signal_1.attributes["gen_ai.output.messages"], + format_simple_expected_output_message( + response_1.choices[0].message.content + ), + ) + else: + assert len(logs) == 9 # 3 logs for first completion, 6 for second - # call two - system_message = ( - {"content": messages_value[0]["content"]} if expect_content else None - ) - assert_message_in_logs( - logs[3], "gen_ai.system.message", system_message, spans[1] - ) + # call one + system_message = ( + {"content": messages_value[0]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[0], "gen_ai.system.message", system_message, spans[0] + ) - user_message = ( - {"content": messages_value[1]["content"]} if expect_content else None - ) - assert_message_in_logs( - logs[4], "gen_ai.user.message", user_message, spans[1] - ) + user_message = ( + {"content": messages_value[1]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[1], "gen_ai.user.message", user_message, spans[0] + ) - assistant_tool_call = {"tool_calls": messages_value[2]["tool_calls"]} - if not expect_content: - assistant_tool_call["tool_calls"][0]["function"]["arguments"] = None - assistant_tool_call["tool_calls"][1]["function"]["arguments"] = None + function_call_0 = {"name": "get_current_weather"} + function_call_1 = {"name": "get_current_weather"} + if expect_content: + function_call_0["arguments"] = ( + response_0.choices[0] + .message.tool_calls[0] + .function.arguments.replace("\n", "") + ) + function_call_1["arguments"] = ( + response_0.choices[0] + .message.tool_calls[1] + .function.arguments.replace("\n", "") + ) + + choice_event = { + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "tool_calls": [ + { + "id": response_0.choices[0].message.tool_calls[0].id, + "type": "function", + "function": function_call_0, + }, + { + "id": response_0.choices[0].message.tool_calls[1].id, + "type": "function", + "function": function_call_1, + }, + ], + }, + } + assert_message_in_logs( + logs[2], "gen_ai.choice", choice_event, spans[0] + ) - assert_message_in_logs( - logs[5], "gen_ai.assistant.message", assistant_tool_call, spans[1] - ) + # call two + system_message = ( + {"content": messages_value[0]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[3], "gen_ai.system.message", system_message, spans[1] + ) - 
tool_message_0 = { - "id": tool_call_result_0["tool_call_id"], - "content": tool_call_result_0["content"] if expect_content else None, - } + user_message = ( + {"content": messages_value[1]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[4], "gen_ai.user.message", user_message, spans[1] + ) - assert_message_in_logs( - logs[6], "gen_ai.tool.message", tool_message_0, spans[1] - ) + assistant_tool_call = {"tool_calls": messages_value[2]["tool_calls"]} + if not expect_content: + assistant_tool_call["tool_calls"][0]["function"]["arguments"] = ( + None + ) + assistant_tool_call["tool_calls"][1]["function"]["arguments"] = ( + None + ) + + assert_message_in_logs( + logs[5], "gen_ai.assistant.message", assistant_tool_call, spans[1] + ) - tool_message_1 = { - "id": tool_call_result_1["tool_call_id"], - "content": tool_call_result_1["content"] if expect_content else None, - } + tool_message_0 = { + "id": tool_call_result_0["tool_call_id"], + "content": tool_call_result_0["content"] + if expect_content + else None, + } - assert_message_in_logs( - logs[7], "gen_ai.tool.message", tool_message_1, spans[1] - ) + assert_message_in_logs( + logs[6], "gen_ai.tool.message", tool_message_0, spans[1] + ) - message = { - "role": "assistant", - "content": response_1.choices[0].message.content - if expect_content - else None, - } - choice = { - "index": 0, - "finish_reason": "stop", - "message": message, - } - assert_message_in_logs(logs[8], "gen_ai.choice", choice, spans[1]) + tool_message_1 = { + "id": tool_call_result_1["tool_call_id"], + "content": tool_call_result_1["content"] + if expect_content + else None, + } + + assert_message_in_logs( + logs[7], "gen_ai.tool.message", tool_message_1, spans[1] + ) + + message = { + "role": "assistant", + "content": response_1.choices[0].message.content + if expect_content + else None, + } + choice = { + "index": 0, + "finish_reason": "stop", + "message": message, + } + assert_message_in_logs(logs[8], "gen_ai.choice", choice, spans[1]) @pytest.mark.vcr() def test_chat_completion_streaming( - span_exporter, log_exporter, openai_client, instrument_with_content + span_exporter, + log_exporter, + openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - kwargs = { - "model": llm_model_value, - "messages": messages_value, - "stream": True, - "stream_options": {"include_usage": True}, - } - - response_stream_usage = None - response_stream_model = None - response_stream_id = None - response_stream_result = "" - response = openai_client.chat.completions.create(**kwargs) - for chunk in response: - if chunk.choices: - response_stream_result += chunk.choices[0].delta.content or "" - - # get the last chunk - if getattr(chunk, "usage", None): - response_stream_usage = chunk.usage - response_stream_model = chunk.model - response_stream_id = chunk.id - - spans = span_exporter.get_finished_spans() - assert_all_attributes( - spans[0], - llm_model_value, - response_stream_id, - response_stream_model, - response_stream_usage.prompt_tokens, - response_stream_usage.completion_tokens, - ) - - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 - - user_message = {"content": "Say this is a test"} - assert_message_in_logs( - logs[0], "gen_ai.user.message", user_message, spans[0] - ) + with vcr.use_cassette("test_chat_completion_streaming.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() + llm_model_value = "gpt-4" + + 
kwargs = { + "model": llm_model_value, + "messages": USER_ONLY_PROMPT, + "stream": True, + "stream_options": {"include_usage": True}, + } - choice_event = { - "index": 0, - "finish_reason": "stop", - "message": {"role": "assistant", "content": response_stream_result}, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0]) + response_stream_usage = None + response_stream_model = None + response_stream_id = None + response_stream_result = "" + response = openai_client.chat.completions.create(**kwargs) + for chunk in response: + if chunk.choices: + response_stream_result += chunk.choices[0].delta.content or "" + + # get the last chunk + if getattr(chunk, "usage", None): + response_stream_usage = chunk.usage + response_stream_model = chunk.model + response_stream_id = chunk.id + + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "events": + assert len(logs) == 1 + + assert_all_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + llm_model_value, + latest_experimental_enabled, + response_stream_id, + response_stream_model, + response_stream_usage.prompt_tokens, + response_stream_usage.completion_tokens, + ) + if latest_experimental_enabled: + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + format_simple_expected_output_message(response_stream_result), + ) + else: + assert len(logs) == 2 + + user_message = {"content": "Say this is a test"} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response_stream_result, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, spans[0] + ) @pytest.mark.vcr() def test_chat_completion_streaming_not_complete( - span_exporter, log_exporter, openai_client, instrument_with_content + span_exporter, + log_exporter, + openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4" - messages_value = [{"role": "user", "content": "Say this is a test"}] - - kwargs = { - "model": llm_model_value, - "messages": messages_value, - "stream": True, - } - - response_stream_model = None - response_stream_id = None - response_stream_result = "" - response = openai_client.chat.completions.create(**kwargs) - for idx, chunk in enumerate(response): - if chunk.choices: - response_stream_result += chunk.choices[0].delta.content or "" - if idx == 1: - # fake a stop - break - - if chunk.model: - response_stream_model = chunk.model - if chunk.id: - response_stream_id = chunk.id - - response.close() - spans = span_exporter.get_finished_spans() - assert_all_attributes( - spans[0], llm_model_value, response_stream_id, response_stream_model - ) - - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 - - user_message = {"content": "Say this is a test"} - assert_message_in_logs( - logs[0], "gen_ai.user.message", user_message, spans[0] - ) + with vcr.use_cassette("test_chat_completion_streaming_not_complete.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() + llm_model_value = "gpt-4" + + kwargs = { + "model": llm_model_value, + "messages": USER_ONLY_PROMPT, + "stream": True, + } - choice_event = { - "index": 0, - 
"finish_reason": "error", - "message": {"role": "assistant", "content": response_stream_result}, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, spans[0]) + response_stream_model = None + response_stream_id = None + response_stream_result = "" + response = openai_client.chat.completions.create(**kwargs) + for idx, chunk in enumerate(response): + if chunk.choices: + response_stream_result += chunk.choices[0].delta.content or "" + if idx == 1: + # fake a stop + break + + if chunk.model: + response_stream_model = chunk.model + if chunk.id: + response_stream_id = chunk.id + + response.close() + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "events": + assert len(logs) == 1 + assert_all_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + llm_model_value, + latest_experimental_enabled, + response_stream_id, + response_stream_model, + ) + if latest_experimental_enabled: + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + format_simple_expected_output_message( + response_stream_result, finish_reason="error" + ), + ) + else: + assert len(logs) == 2 + + user_message = {"content": "Say this is a test"} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event = { + "index": 0, + "finish_reason": "error", + "message": { + "role": "assistant", + "content": response_stream_result, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, spans[0] + ) @pytest.mark.vcr() def test_chat_completion_multiple_choices_streaming( - span_exporter, log_exporter, openai_client, instrument_with_content + span_exporter, + log_exporter, + openai_client, + instrument_with_content, + content_mode, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [ - {"role": "system", "content": "You're a helpful assistant."}, - { - "role": "user", - "content": "What's the weather in Seattle and San Francisco today?", - }, - ] - - response_0 = openai_client.chat.completions.create( - messages=messages_value, - model=llm_model_value, - n=2, - stream=True, - stream_options={"include_usage": True}, - ) - - # two strings for each choice - response_stream_result = ["", ""] - finish_reasons = ["", ""] - for chunk in response_0: - if chunk.choices: - for choice in chunk.choices: - response_stream_result[choice.index] += ( - choice.delta.content or "" - ) - if choice.finish_reason: - finish_reasons[choice.index] = choice.finish_reason - - # get the last chunk - if getattr(chunk, "usage", None): - response_stream_usage = chunk.usage - response_stream_model = chunk.model - response_stream_id = chunk.id - - # sanity check - assert "stop" == finish_reasons[0] - - spans = span_exporter.get_finished_spans() - assert_all_attributes( - spans[0], - llm_model_value, - response_stream_id, - response_stream_model, - response_stream_usage.prompt_tokens, - response_stream_usage.completion_tokens, - ) - - logs = log_exporter.get_finished_logs() - assert len(logs) == 4 - - system_message = {"content": messages_value[0]["content"]} - assert_message_in_logs( - logs[0], "gen_ai.system.message", system_message, spans[0] - ) - - user_message = { - "content": "What's the weather in Seattle and San Francisco today?" 
- } - assert_message_in_logs( - logs[1], "gen_ai.user.message", user_message, spans[0] - ) - - choice_event_0 = { - "index": 0, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": "".join(response_stream_result[0]), - }, - } - assert_message_in_logs(logs[2], "gen_ai.choice", choice_event_0, spans[0]) + with vcr.use_cassette( + "test_chat_completion_multiple_choices_streaming.yaml" + ): + latest_experimental_enabled = is_latest_experimental_enabled() + response_0 = openai_client.chat.completions.create( + messages=WEATHER_TOOL_PROMPT, + model=DEFAULT_MODEL, + n=2, + stream=True, + stream_options={"include_usage": True}, + ) - choice_event_1 = { - "index": 1, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": "".join(response_stream_result[1]), - }, - } - assert_message_in_logs(logs[3], "gen_ai.choice", choice_event_1, spans[0]) + # two strings for each choice + response_stream_result = ["", ""] + finish_reasons = ["", ""] + for chunk in response_0: + if chunk.choices: + for choice in chunk.choices: + response_stream_result[choice.index] += ( + choice.delta.content or "" + ) + if choice.finish_reason: + finish_reasons[choice.index] = choice.finish_reason + + # get the last chunk + if getattr(chunk, "usage", None): + response_stream_usage = chunk.usage + response_stream_model = chunk.model + response_stream_id = chunk.id + + # sanity check + assert "stop" == finish_reasons[0] + + spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "events": + assert len(logs) == 1 + assert_all_attributes( + spans[0], + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + latest_experimental_enabled, + response_stream_id, + response_stream_model, + response_stream_usage.prompt_tokens, + response_stream_usage.completion_tokens, + ) + if latest_experimental_enabled: + expected_output_messages = [ + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": "".join(response_stream_result[0]), + } + ], + "finish_reason": "stop", + }, + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": "".join(response_stream_result[1]), + } + ], + "finish_reason": "stop", + }, + ] + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], + WEATHER_TOOL_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], + expected_output_messages, + ) + else: + assert len(logs) == 4 + + system_message = {"content": WEATHER_TOOL_PROMPT[0]["content"]} + assert_message_in_logs( + logs[0], "gen_ai.system.message", system_message, spans[0] + ) + + user_message = { + "content": "What's the weather in Seattle and San Francisco today?" 
+ } + assert_message_in_logs( + logs[1], "gen_ai.user.message", user_message, spans[0] + ) + + choice_event_0 = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": "".join(response_stream_result[0]), + }, + } + assert_message_in_logs( + logs[2], "gen_ai.choice", choice_event_0, spans[0] + ) + + choice_event_1 = { + "index": 1, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": "".join(response_stream_result[1]), + }, + } + assert_message_in_logs( + logs[3], "gen_ai.choice", choice_event_1, spans[0] + ) @pytest.mark.vcr() def test_chat_completion_multiple_tools_streaming_with_content( - span_exporter, log_exporter, openai_client, instrument_with_content + span_exporter, + log_exporter, + openai_client, + instrument_with_content, + content_mode, + vcr, ): - chat_completion_multiple_tools_streaming( - span_exporter, log_exporter, openai_client, True - ) + with vcr.use_cassette( + "test_chat_completion_multiple_tools_streaming_with_content.yaml" + ): + chat_completion_multiple_tools_streaming( + span_exporter, + log_exporter, + openai_client, + True, + content_mode, + is_latest_experimental_enabled(), + ) @pytest.mark.vcr() def test_chat_completion_multiple_tools_streaming_no_content( - span_exporter, log_exporter, openai_client, instrument_no_content + span_exporter, + log_exporter, + openai_client, + instrument_no_content, + vcr, ): - chat_completion_multiple_tools_streaming( - span_exporter, log_exporter, openai_client, False - ) + with vcr.use_cassette( + "test_chat_completion_multiple_tools_streaming_no_content.yaml" + ): + chat_completion_multiple_tools_streaming( + span_exporter, + log_exporter, + openai_client, + False, + None, + is_latest_experimental_enabled(), + ) @pytest.mark.vcr() @@ -665,57 +1087,88 @@ def test_chat_completion_with_content_span_unsampled( log_exporter, openai_client, instrument_with_content_unsampled, + content_mode, + vcr, ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with vcr.use_cassette( + "test_chat_completion_with_content_span_unsampled.yaml" + ): + latest_experimental_enabled = is_latest_experimental_enabled() - response = openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, stream=False - ) + response = openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False + ) - spans = span_exporter.get_finished_spans() - assert len(spans) == 0 + spans = span_exporter.get_finished_spans() + assert len(spans) == 0 - logs = log_exporter.get_finished_logs() - assert len(logs) == 2 + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 - user_message = {"content": messages_value[0]["content"]} - assert_message_in_logs(logs[0], "gen_ai.user.message", user_message, None) + assert_completion_attributes( + None, + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + response, + latest_experimental_enabled, + ) - choice_event = { - "index": 0, - "finish_reason": "stop", - "message": { - "role": "assistant", - "content": response.choices[0].message.content, - }, - } - assert_message_in_logs(logs[1], "gen_ai.choice", choice_event, None) + if latest_experimental_enabled: + if content_mode == "event": + assert_messages_attribute( + logs[0].log_record.attributes["gen_ai.input.messages"], + USER_ONLY_EXPECTED_INPUT_MESSAGES, + ) + assert_messages_attribute( + 
logs[0].log_record.attributes["gen_ai.output.messages"], + format_simple_expected_output_message( + response.choices[0].message.content + ), + ) + else: + assert len(logs) == 2 + + user_message = {"content": USER_ONLY_PROMPT[0]["content"]} + assert_message_in_logs( + logs[0], "gen_ai.user.message", user_message, None + ) + + choice_event = { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": response.choices[0].message.content, + }, + } + assert_message_in_logs( + logs[1], "gen_ai.choice", choice_event, None + ) - assert logs[0].log_record.trace_id is not None - assert logs[0].log_record.span_id is not None - assert logs[0].log_record.trace_flags == 0 + assert logs[0].log_record.trace_id is not None + assert logs[0].log_record.span_id is not None + assert logs[0].log_record.trace_flags == 0 - assert logs[0].log_record.trace_id == logs[1].log_record.trace_id - assert logs[0].log_record.span_id == logs[1].log_record.span_id - assert logs[0].log_record.trace_flags == logs[1].log_record.trace_flags + assert logs[0].log_record.trace_id == logs[1].log_record.trace_id + assert logs[0].log_record.span_id == logs[1].log_record.span_id + assert ( + logs[0].log_record.trace_flags + == logs[1].log_record.trace_flags + ) def chat_completion_multiple_tools_streaming( - span_exporter, log_exporter, openai_client, expect_content + span_exporter, + log_exporter, + openai_client, + expect_content, + content_mode, + latest_experimental_enabled, ): - llm_model_value = "gpt-4o-mini" - messages_value = [ - {"role": "system", "content": "You're a helpful assistant."}, - { - "role": "user", - "content": "What's the weather in Seattle and San Francisco today?", - }, - ] - response = openai_client.chat.completions.create( - messages=messages_value, - model=llm_model_value, + messages=WEATHER_TOOL_PROMPT, + model=DEFAULT_MODEL, tool_choice="auto", tools=[get_current_weather_tool_definition()], stream=True, @@ -751,64 +1204,109 @@ def chat_completion_multiple_tools_streaming( assert "tool_calls" == finish_reason spans = span_exporter.get_finished_spans() + logs = log_exporter.get_finished_logs() + if content_mode == "event": + assert len(logs) == 1 + assert_all_attributes( spans[0], - llm_model_value, + logs[0].log_record if content_mode == "event" else None, + DEFAULT_MODEL, + latest_experimental_enabled, response_stream_id, response_stream_model, response_stream_usage.prompt_tokens, response_stream_usage.completion_tokens, ) - logs = log_exporter.get_finished_logs() - assert len(logs) == 3 + if latest_experimental_enabled: + if expect_content: + # first call + signal = ( + logs[0].log_record if content_mode == "event" else spans[0] + ) + assert_messages_attribute( + signal.attributes["gen_ai.input.messages"], WEATHER_TOOL_EXPECTED_INPUT_MESSAGES + ) + + first_output = [ + { + "role": "assistant", + "parts": [ + { + "type": "tool_call", + "id": tool_call_ids[0], + "name": "get_current_weather", + "arguments": {"location": "Seattle, WA"}, + }, + { + "type": "tool_call", + "id": tool_call_ids[1], + "name": "get_current_weather", + "arguments": {"location": "San Francisco, CA"}, + }, + ], + "finish_reason": "tool_calls", + } + ] + assert_messages_attribute( + signal.attributes["gen_ai.output.messages"], first_output + ) + else: + assert len(logs) == 3 - system_message = ( - {"content": messages_value[0]["content"]} if expect_content else None - ) - assert_message_in_logs( - logs[0], "gen_ai.system.message", system_message, spans[0] - ) + system_message = ( + {"content": 
WEATHER_TOOL_PROMPT[0]["content"]} + if expect_content + else None + ) + assert_message_in_logs( + logs[0], "gen_ai.system.message", system_message, spans[0] + ) - user_message = ( - {"content": "What's the weather in Seattle and San Francisco today?"} - if expect_content - else None - ) - assert_message_in_logs( - logs[1], "gen_ai.user.message", user_message, spans[0] - ) + user_message = ( + { + "content": "What's the weather in Seattle and San Francisco today?" + } + if expect_content + else None + ) + assert_message_in_logs( + logs[1], "gen_ai.user.message", user_message, spans[0] + ) - choice_event = { - "index": 0, - "finish_reason": "tool_calls", - "message": { - "role": "assistant", - "tool_calls": [ - { - "id": tool_call_ids[0], - "type": "function", - "function": { - "name": tool_names[0], - "arguments": tool_args[0].replace("\n", "") - if expect_content - else None, + choice_event = { + "index": 0, + "finish_reason": "tool_calls", + "message": { + "role": "assistant", + "tool_calls": [ + { + "id": tool_call_ids[0], + "type": "function", + "function": { + "name": tool_names[0], + "arguments": tool_args[0].replace("\n", "") + if expect_content + else None, + }, }, - }, - { - "id": tool_call_ids[1], - "type": "function", - "function": { - "name": tool_names[1], - "arguments": tool_args[1].replace("\n", "") - if expect_content - else None, + { + "id": tool_call_ids[1], + "type": "function", + "function": { + "name": tool_names[1], + "arguments": tool_args[1].replace("\n", "") + if expect_content + else None, + }, }, - }, - ], - }, - } - assert_message_in_logs(logs[2], "gen_ai.choice", choice_event, spans[0]) + ], + }, + } + assert_message_in_logs( + logs[2], "gen_ai.choice", choice_event, spans[0] + ) def assert_message_in_logs(log, event_name, expected_content, parent_span): @@ -840,110 +1338,3 @@ def remove_none_values(body): else: result[key] = value return result - - -def assert_completion_attributes( - span: ReadableSpan, - request_model: str, - response: ChatCompletion, - operation_name: str = "chat", - server_address: str = "api.openai.com", -): - return assert_all_attributes( - span, - request_model, - response.id, - response.model, - response.usage.prompt_tokens, - response.usage.completion_tokens, - operation_name, - server_address, - ) - - -def assert_all_attributes( - span: ReadableSpan, - request_model: str, - response_id: str = None, - response_model: str = None, - input_tokens: Optional[int] = None, - output_tokens: Optional[int] = None, - operation_name: str = "chat", - server_address: str = "api.openai.com", -): - assert span.name == f"{operation_name} {request_model}" - assert ( - operation_name - == span.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME] - ) - assert ( - GenAIAttributes.GenAiSystemValues.OPENAI.value - == span.attributes[GenAIAttributes.GEN_AI_SYSTEM] - ) - assert ( - request_model == span.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL] - ) - if response_model: - assert ( - response_model - == span.attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL] - ) - else: - assert GenAIAttributes.GEN_AI_RESPONSE_MODEL not in span.attributes - - if response_id: - assert ( - response_id == span.attributes[GenAIAttributes.GEN_AI_RESPONSE_ID] - ) - else: - assert GenAIAttributes.GEN_AI_RESPONSE_ID not in span.attributes - - if input_tokens: - assert ( - input_tokens - == span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS] - ) - else: - assert GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS not in span.attributes - - if output_tokens: - assert ( - output_tokens - 
== span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS] - ) - else: - assert ( - GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS not in span.attributes - ) - - assert server_address == span.attributes[ServerAttributes.SERVER_ADDRESS] - - -def assert_log_parent(log, span): - if span: - assert log.log_record.trace_id == span.get_span_context().trace_id - assert log.log_record.span_id == span.get_span_context().span_id - assert ( - log.log_record.trace_flags == span.get_span_context().trace_flags - ) - - -def get_current_weather_tool_definition(): - return { - "type": "function", - "function": { - "name": "get_current_weather", - "description": "Get the current weather in a given location", - "parameters": { - "type": "object", - "properties": { - "location": { - "type": "string", - "description": "The city and state, e.g. Boston, MA", - }, - }, - "required": ["location"], - "additionalProperties": False, - }, - }, - } diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_metrics.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_metrics.py index ffcd99c5b4..76a89a90c8 100644 --- a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_metrics.py +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_chat_metrics.py @@ -1,5 +1,8 @@ import pytest +from opentelemetry.instrumentation.openai_v2.utils import ( + is_latest_experimental_enabled, +) from opentelemetry.semconv._incubating.attributes import ( gen_ai_attributes as GenAIAttributes, ) @@ -7,6 +10,11 @@ server_attributes as ServerAttributes, ) from opentelemetry.semconv._incubating.metrics import gen_ai_metrics +from tests.test_utils import ( + DEFAULT_MODEL, + USER_ONLY_EXPECTED_INPUT_MESSAGES, + USER_ONLY_PROMPT, +) _DURATION_BUCKETS = ( 0.01, @@ -42,15 +50,21 @@ ) -def assert_all_metric_attributes(data_point): +def assert_all_metric_attributes(data_point, latest_experimental_enabled): assert GenAIAttributes.GEN_AI_OPERATION_NAME in data_point.attributes assert ( data_point.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME] == GenAIAttributes.GenAiOperationNameValues.CHAT.value ) - assert GenAIAttributes.GEN_AI_SYSTEM in data_point.attributes + + provider_name_attr_name = ( + "gen_ai.provider.name" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_SYSTEM + ) + assert provider_name_attr_name in data_point.attributes assert ( - data_point.attributes[GenAIAttributes.GEN_AI_SYSTEM] + data_point.attributes[provider_name_attr_name] == GenAIAttributes.GenAiSystemValues.OPENAI.value ) assert GenAIAttributes.GEN_AI_REQUEST_MODEL in data_point.attributes @@ -63,21 +77,25 @@ def assert_all_metric_attributes(data_point): data_point.attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL] == "gpt-4o-mini-2024-07-18" ) - assert "gen_ai.openai.response.system_fingerprint" in data_point.attributes - assert ( - data_point.attributes["gen_ai.openai.response.system_fingerprint"] - == "fp_0ba0d124f1" + + system_fingerprint_attr_key = ( + "openai.response.system_fingerprint" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SYSTEM_FINGERPRINT ) + assert system_fingerprint_attr_key in data_point.attributes assert ( - GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER - in data_point.attributes + data_point.attributes[system_fingerprint_attr_key] == "fp_0ba0d124f1" ) - assert ( - data_point.attributes[ - GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER - ] - == "default" + + service_tier_attr_key = ( + 
"openai.response.service_tier" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_OPENAI_RESPONSE_SERVICE_TIER ) + assert service_tier_attr_key in data_point.attributes + assert service_tier_attr_key in data_point.attributes + assert data_point.attributes[service_tier_attr_key] == "default" assert ( data_point.attributes[ServerAttributes.SERVER_ADDRESS] == "api.openai.com" @@ -86,142 +104,154 @@ def assert_all_metric_attributes(data_point): @pytest.mark.vcr() def test_chat_completion_metrics( - metric_reader, openai_client, instrument_with_content + metric_reader, openai_client, instrument_with_content, vcr ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with vcr.use_cassette("test_chat_completion_metrics.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() - openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, stream=False - ) + openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False + ) - metrics = metric_reader.get_metrics_data().resource_metrics - assert len(metrics) == 1 + metrics = metric_reader.get_metrics_data().resource_metrics + assert len(metrics) == 1 - metric_data = metrics[0].scope_metrics[0].metrics - assert len(metric_data) == 2 + metric_data = metrics[0].scope_metrics[0].metrics + assert len(metric_data) == 2 - duration_metric = next( - ( - m - for m in metric_data - if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION - ), - None, - ) - assert duration_metric is not None - - duration_point = duration_metric.data.data_points[0] - assert duration_point.sum > 0 - assert_all_metric_attributes(duration_point) - assert duration_point.explicit_bounds == _DURATION_BUCKETS - - token_usage_metric = next( - ( - m - for m in metric_data - if m.name == gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE - ), - None, - ) - assert token_usage_metric is not None - - input_token_usage = next( - ( - d - for d in token_usage_metric.data.data_points - if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE] - == GenAIAttributes.GenAiTokenTypeValues.INPUT.value - ), - None, - ) - assert input_token_usage is not None - assert input_token_usage.sum == 12 - - assert input_token_usage.explicit_bounds == _TOKEN_USAGE_BUCKETS - assert input_token_usage.bucket_counts[2] == 1 - assert_all_metric_attributes(input_token_usage) - - output_token_usage = next( - ( - d - for d in token_usage_metric.data.data_points - if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE] - == GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value - ), - None, - ) - assert output_token_usage is not None - assert output_token_usage.sum == 5 - # assert against buckets [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864] - assert output_token_usage.bucket_counts[2] == 1 - assert_all_metric_attributes(output_token_usage) + duration_metric = next( + ( + m + for m in metric_data + if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION + ), + None, + ) + assert duration_metric is not None + + duration_point = duration_metric.data.data_points[0] + assert duration_point.sum > 0 + assert_all_metric_attributes( + duration_point, latest_experimental_enabled + ) + assert duration_point.explicit_bounds == _DURATION_BUCKETS + + token_usage_metric = next( + ( + m + for m in metric_data + if m.name == gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE + ), + None, + ) + assert token_usage_metric is not None + + input_token_usage = next( + ( + d + 
for d in token_usage_metric.data.data_points + if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE] + == GenAIAttributes.GenAiTokenTypeValues.INPUT.value + ), + None, + ) + assert input_token_usage is not None + assert input_token_usage.sum == 12 + + assert input_token_usage.explicit_bounds == _TOKEN_USAGE_BUCKETS + assert input_token_usage.bucket_counts[2] == 1 + assert_all_metric_attributes( + input_token_usage, latest_experimental_enabled + ) + + output_token_usage = next( + ( + d + for d in token_usage_metric.data.data_points + if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE] + == GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value + ), + None, + ) + assert output_token_usage is not None + assert output_token_usage.sum == 5 + # assert against buckets [1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864] + assert output_token_usage.bucket_counts[2] == 1 + assert_all_metric_attributes( + output_token_usage, latest_experimental_enabled + ) @pytest.mark.vcr() @pytest.mark.asyncio() async def test_async_chat_completion_metrics( - metric_reader, async_openai_client, instrument_with_content + metric_reader, async_openai_client, instrument_with_content, vcr ): - llm_model_value = "gpt-4o-mini" - messages_value = [{"role": "user", "content": "Say this is a test"}] + with vcr.use_cassette("test_async_chat_completion_metrics.yaml"): + latest_experimental_enabled = is_latest_experimental_enabled() - await async_openai_client.chat.completions.create( - messages=messages_value, model=llm_model_value, stream=False - ) + await async_openai_client.chat.completions.create( + messages=USER_ONLY_PROMPT, model=DEFAULT_MODEL, stream=False + ) - metrics = metric_reader.get_metrics_data().resource_metrics - assert len(metrics) == 1 + metrics = metric_reader.get_metrics_data().resource_metrics + assert len(metrics) == 1 - metric_data = metrics[0].scope_metrics[0].metrics - assert len(metric_data) == 2 + metric_data = metrics[0].scope_metrics[0].metrics + assert len(metric_data) == 2 - duration_metric = next( - ( - m - for m in metric_data - if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION - ), - None, - ) - assert duration_metric is not None - assert duration_metric.data.data_points[0].sum > 0 - assert_all_metric_attributes(duration_metric.data.data_points[0]) - - token_usage_metric = next( - ( - m - for m in metric_data - if m.name == gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE - ), - None, - ) - assert token_usage_metric is not None - - input_token_usage = next( - ( - d - for d in token_usage_metric.data.data_points - if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE] - == GenAIAttributes.GenAiTokenTypeValues.INPUT.value - ), - None, - ) + duration_metric = next( + ( + m + for m in metric_data + if m.name == gen_ai_metrics.GEN_AI_CLIENT_OPERATION_DURATION + ), + None, + ) + assert duration_metric is not None + assert duration_metric.data.data_points[0].sum > 0 + assert_all_metric_attributes( + duration_metric.data.data_points[0], latest_experimental_enabled + ) - assert input_token_usage is not None - assert input_token_usage.sum == 12 - assert_all_metric_attributes(input_token_usage) - - output_token_usage = next( - ( - d - for d in token_usage_metric.data.data_points - if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE] - == GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value - ), - None, - ) + token_usage_metric = next( + ( + m + for m in metric_data + if m.name == gen_ai_metrics.GEN_AI_CLIENT_TOKEN_USAGE + ), + None, + ) + assert token_usage_metric is not 
None + + input_token_usage = next( + ( + d + for d in token_usage_metric.data.data_points + if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE] + == GenAIAttributes.GenAiTokenTypeValues.INPUT.value + ), + None, + ) + + assert input_token_usage is not None + assert input_token_usage.sum == 12 + assert_all_metric_attributes( + input_token_usage, latest_experimental_enabled + ) + + output_token_usage = next( + ( + d + for d in token_usage_metric.data.data_points + if d.attributes[GenAIAttributes.GEN_AI_TOKEN_TYPE] + == GenAIAttributes.GenAiTokenTypeValues.COMPLETION.value + ), + None, + ) - assert output_token_usage is not None - assert output_token_usage.sum == 12 - assert_all_metric_attributes(output_token_usage) + assert output_token_usage is not None + assert output_token_usage.sum == 12 + assert_all_metric_attributes( + output_token_usage, latest_experimental_enabled + ) diff --git a/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_utils.py b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_utils.py new file mode 100644 index 0000000000..7a6d28e2f5 --- /dev/null +++ b/instrumentation-genai/opentelemetry-instrumentation-openai-v2/tests/test_utils.py @@ -0,0 +1,303 @@ +import json +from typing import Optional + +from openai.resources.chat.completions import ChatCompletion + +from opentelemetry.sdk._logs import LogRecord +from opentelemetry.sdk.trace import ReadableSpan +from opentelemetry.semconv._incubating.attributes import ( + gen_ai_attributes as GenAIAttributes, +) +from opentelemetry.semconv._incubating.attributes import ( + server_attributes as ServerAttributes, +) + +DEFAULT_MODEL = "gpt-4o-mini" +USER_ONLY_PROMPT = [{"role": "user", "content": "Say this is a test"}] +USER_ONLY_EXPECTED_INPUT_MESSAGES = [ + { + "role": "user", + "parts": [ + { + "type": "text", + "content": USER_ONLY_PROMPT[0]["content"], + } + ], + } +] +WEATHER_TOOL_PROMPT = [ + {"role": "system", "content": "You're a helpful assistant."}, + { + "role": "user", + "content": "What's the weather in Seattle and San Francisco today?", + }, +] +WEATHER_TOOL_EXPECTED_INPUT_MESSAGES = [ + { + "role": "system", + "parts": [ + { + "type": "text", + "content": WEATHER_TOOL_PROMPT[0]["content"], + } + ], + }, + { + "role": "user", + "parts": [ + { + "type": "text", + "content": WEATHER_TOOL_PROMPT[1]["content"], + } + ], + }, +] + + +def assert_all_attributes( + span: ReadableSpan, + details_event: LogRecord, + request_model: str, + latest_experimental_enabled: bool, + response_id: str = None, + response_model: str = None, + input_tokens: Optional[int] = None, + output_tokens: Optional[int] = None, + operation_name: str = "chat", + server_address: str = "api.openai.com", +): + if span: + assert span.name == f"{operation_name} {request_model}" + if details_event: + assert ( + "gen_ai.client.inference.operation.details" + == details_event.attributes["event.name"] + ) + + if span: + assert ( + operation_name + == span.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME] + ) + if details_event: + assert ( + operation_name + == details_event.attributes[GenAIAttributes.GEN_AI_OPERATION_NAME] + ) + + provider_name_attr_name = ( + "gen_ai.provider.name" + if latest_experimental_enabled + else GenAIAttributes.GEN_AI_SYSTEM + ) + if span: + assert ( + GenAIAttributes.GenAiSystemValues.OPENAI.value + == span.attributes[provider_name_attr_name] + ) + if details_event: + assert ( + GenAIAttributes.GenAiSystemValues.OPENAI.value + == details_event.attributes[provider_name_attr_name] + ) + + if 
span:
+        assert (
+            request_model
+            == span.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
+        )
+    if details_event:
+        assert (
+            request_model
+            == details_event.attributes[GenAIAttributes.GEN_AI_REQUEST_MODEL]
+        )
+
+    if response_model:
+        if span:
+            assert (
+                response_model
+                == span.attributes[GenAIAttributes.GEN_AI_RESPONSE_MODEL]
+            )
+        if details_event:
+            assert (
+                response_model
+                == details_event.attributes[
+                    GenAIAttributes.GEN_AI_RESPONSE_MODEL
+                ]
+            )
+    else:
+        if span:
+            assert GenAIAttributes.GEN_AI_RESPONSE_MODEL not in span.attributes
+        if details_event:
+            assert (
+                GenAIAttributes.GEN_AI_RESPONSE_MODEL
+                not in details_event.attributes
+            )
+
+    if response_id:
+        if span:
+            assert (
+                response_id
+                == span.attributes[GenAIAttributes.GEN_AI_RESPONSE_ID]
+            )
+        if details_event:
+            assert (
+                response_id
+                == details_event.attributes[GenAIAttributes.GEN_AI_RESPONSE_ID]
+            )
+    else:
+        if span:
+            assert GenAIAttributes.GEN_AI_RESPONSE_ID not in span.attributes
+        if details_event:
+            assert (
+                GenAIAttributes.GEN_AI_RESPONSE_ID
+                not in details_event.attributes
+            )
+
+    if input_tokens:
+        if span:
+            assert (
+                input_tokens
+                == span.attributes[GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS]
+            )
+        if details_event:
+            assert (
+                input_tokens
+                == details_event.attributes[
+                    GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS
+                ]
+            )
+    else:
+        if span:
+            assert (
+                GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS
+                not in span.attributes
+            )
+        if details_event:
+            assert (
+                GenAIAttributes.GEN_AI_USAGE_INPUT_TOKENS
+                not in details_event.attributes
+            )
+
+    if output_tokens:
+        if span:
+            assert (
+                output_tokens
+                == span.attributes[GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS]
+            )
+        if details_event:
+            assert (
+                output_tokens
+                == details_event.attributes[
+                    GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS
+                ]
+            )
+    else:
+        if span:
+            assert (
+                GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS
+                not in span.attributes
+            )
+        if details_event:
+            assert (
+                GenAIAttributes.GEN_AI_USAGE_OUTPUT_TOKENS
+                not in details_event.attributes
+            )
+
+    if span:
+        assert (
+            server_address == span.attributes[ServerAttributes.SERVER_ADDRESS]
+        )
+    if details_event:
+        assert (
+            server_address
+            == details_event.attributes[ServerAttributes.SERVER_ADDRESS]
+        )
+
+
+def assert_log_parent(log, span):
+    if span:
+        assert log.log_record.trace_id == span.get_span_context().trace_id
+        assert log.log_record.span_id == span.get_span_context().span_id
+        assert (
+            log.log_record.trace_flags == span.get_span_context().trace_flags
+        )
+
+
+def get_current_weather_tool_definition():
+    return {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. 
Boston, MA", + }, + }, + "required": ["location"], + "additionalProperties": False, + }, + }, + } + + +def remove_none_values(body): + result = {} + for key, value in body.items(): + if value is None: + continue + if isinstance(value, dict): + result[key] = remove_none_values(value) + elif isinstance(value, list): + result[key] = [remove_none_values(i) for i in value] + else: + result[key] = value + return result + + +def assert_completion_attributes( + span: ReadableSpan, + details_event: LogRecord, + request_model: str, + response: ChatCompletion, + latest_experimental_enabled: bool, + operation_name: str = "chat", + server_address: str = "api.openai.com", +): + return assert_all_attributes( + span, + details_event, + request_model, + latest_experimental_enabled, + response.id, + response.model, + response.usage.prompt_tokens, + response.usage.completion_tokens, + operation_name, + server_address, + ) + + +def assert_messages_attribute(actual, expected): + assert json.loads(actual) == expected + + +def format_simple_expected_output_message( + content: str, finish_reason: str = "stop" +): + return [ + { + "role": "assistant", + "parts": [ + { + "type": "text", + "content": content, + } + ], + "finish_reason": finish_reason, + } + ]