From 66e09627e1ec6a26c5a9169a20c21c5d47a75801 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Tue, 20 Jan 2026 14:42:27 +0100 Subject: [PATCH 1/5] feat(integrations): openai: detect and report the time to first token metric (TTFT) as `gen_ai.response.time_to_first_token` --- sentry_sdk/consts.py | 6 ++++++ sentry_sdk/integrations/openai.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/sentry_sdk/consts.py b/sentry_sdk/consts.py index e53533018f..a932a05a60 100644 --- a/sentry_sdk/consts.py +++ b/sentry_sdk/consts.py @@ -518,6 +518,12 @@ class SPANDATA: Example: ["The weather in Paris is rainy and overcast, with temperatures around 57°F", "The weather in London is sunny and warm, with temperatures around 65°F"] """ + GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN = "gen_ai.response.time_to_first_token" + """ + The time it took to receive the first token from the model. + Example: 0.1 + """ + GEN_AI_RESPONSE_TOOL_CALLS = "gen_ai.response.tool_calls" """ The tool calls in the model's response. diff --git a/sentry_sdk/integrations/openai.py b/sentry_sdk/integrations/openai.py index 66dc4a1c48..abb64e6b88 100644 --- a/sentry_sdk/integrations/openai.py +++ b/sentry_sdk/integrations/openai.py @@ -1,5 +1,6 @@ import sys from functools import wraps +import time import sentry_sdk from sentry_sdk import consts @@ -249,6 +250,7 @@ def _set_output_data( response: "Any", kwargs: "dict[str, Any]", integration: "OpenAIIntegration", + start_time: "Optional[float]" = None, finish_span: bool = True, ) -> None: if hasattr(response, "model"): @@ -263,6 +265,8 @@ def _set_output_data( if messages is not None and isinstance(messages, str): messages = [messages] + ttft: "Optional[float]" = None + if hasattr(response, "choices"): if should_send_default_pii() and integration.include_prompts: response_text = [ @@ -320,6 +324,7 @@ def _set_output_data( old_iterator = response._iterator def new_iterator() -> "Iterator[ChatCompletionChunk]": + nonlocal ttft count_tokens_manually = True for x in old_iterator: with capture_internal_exceptions(): @@ -330,6 +335,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]": if hasattr(choice, "delta") and hasattr( choice.delta, "content" ): + if start_time is not None and ttft is None: + ttft = time.perf_counter() - start_time content = choice.delta.content if len(data_buf) <= choice_index: data_buf.append([]) @@ -338,6 +345,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]": # OpenAI responses API elif hasattr(x, "delta"): + if start_time is not None and ttft is None: + ttft = time.perf_counter() - start_time if len(data_buf) == 0: data_buf.append([]) data_buf[0].append(x.delta or "") @@ -356,6 +365,10 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]": yield x with capture_internal_exceptions(): + if ttft is not None: + set_data_normalized( + span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft + ) if len(data_buf) > 0: all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: @@ -375,6 +388,7 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]": span.__exit__(None, None, None) async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]": + nonlocal ttft count_tokens_manually = True async for x in old_iterator: with capture_internal_exceptions(): @@ -385,6 +399,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]": if hasattr(choice, "delta") and hasattr( choice.delta, "content" ): + if start_time is not None and ttft is None: + ttft = time.perf_counter() - start_time content = choice.delta.content if len(data_buf) <= choice_index: data_buf.append([]) @@ -393,6 +409,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]": # OpenAI responses API elif hasattr(x, "delta"): + if start_time is not None and ttft is None: + ttft = time.perf_counter() - start_time if len(data_buf) == 0: data_buf.append([]) data_buf[0].append(x.delta or "") @@ -411,6 +429,10 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]": yield x with capture_internal_exceptions(): + if ttft is not None: + set_data_normalized( + span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft + ) if len(data_buf) > 0: all_responses = ["".join(chunk) for chunk in data_buf] if should_send_default_pii() and integration.include_prompts: @@ -465,9 +487,10 @@ def _new_chat_completion_common(f: "Any", *args: "Any", **kwargs: "Any") -> "Any _set_input_data(span, kwargs, operation, integration) + start_time = time.perf_counter() response = yield f, args, kwargs - _set_output_data(span, response, kwargs, integration, finish_span=True) + _set_output_data(span, response, kwargs, integration, start_time, finish_span=True) return response @@ -645,9 +668,10 @@ def _new_responses_create_common(f: "Any", *args: "Any", **kwargs: "Any") -> "An _set_input_data(span, kwargs, operation, integration) + start_time = time.perf_counter() response = yield f, args, kwargs - _set_output_data(span, response, kwargs, integration, finish_span=True) + _set_output_data(span, response, kwargs, integration, start_time, finish_span=True) return response From 77e9e2015a62fa3d23876c31702e4e22be70e403 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Tue, 27 Jan 2026 10:19:50 +0100 Subject: [PATCH 2/5] feat(integrations): openai-agents: record TTFT for ai_spans --- .../openai_agents/patches/models.py | 15 ++- .../openai_agents/spans/ai_client.py | 5 + .../openai_agents/test_openai_agents.py | 97 +++++++++++++++++++ 3 files changed, 116 insertions(+), 1 deletion(-) diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py index 5f18a859e2..063155858e 100644 --- a/sentry_sdk/integrations/openai_agents/patches/models.py +++ b/sentry_sdk/integrations/openai_agents/patches/models.py @@ -1,5 +1,5 @@ import copy -import sys +import time from functools import wraps from sentry_sdk.integrations import DidNotEnable @@ -149,8 +149,21 @@ async def wrapped_stream_response(*args: "Any", **kwargs: "Any") -> "Any": span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True) streaming_response = None + ttft_recorded = False async for event in original_stream_response(*args, **kwargs): + # Detect first content token (text delta event) + if not ttft_recorded and hasattr(event, "delta"): + start_time = getattr( + agent, "_sentry_chat_ttft_start_time", None + ) + if start_time is not None: + ttft = time.perf_counter() - start_time + span.set_data( + SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft + ) + ttft_recorded = True + # Capture the full response from ResponseCompletedEvent if hasattr(event, "response"): streaming_response = event.response diff --git a/sentry_sdk/integrations/openai_agents/spans/ai_client.py b/sentry_sdk/integrations/openai_agents/spans/ai_client.py index c099f133f4..364728bf23 100644 --- a/sentry_sdk/integrations/openai_agents/spans/ai_client.py +++ b/sentry_sdk/integrations/openai_agents/spans/ai_client.py @@ -1,3 +1,5 @@ +import time + import sentry_sdk from sentry_sdk.consts import OP, SPANDATA @@ -36,6 +38,9 @@ def ai_client_span( # TODO-anton: remove hardcoded stuff and replace something that also works for embedding and so on span.set_data(SPANDATA.GEN_AI_OPERATION_NAME, "chat") + # Store start time for TTFT calculation on the agent object + agent._sentry_chat_ttft_start_time = time.perf_counter() + _set_agent_data(span, agent) _set_input_data(span, get_response_kwargs) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index c66c53b2ef..d62fd0dbd3 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -2202,3 +2202,100 @@ async def test_streaming_span_update_captures_response_data( assert span._data["gen_ai.usage.input_tokens"] == 10 assert span._data["gen_ai.usage.output_tokens"] == 20 assert span._data["gen_ai.response.model"] == "gpt-4-streaming" + + +@pytest.mark.asyncio +async def test_streaming_ttft_on_chat_span(sentry_init, test_agent): + """ + Test that time-to-first-token (TTFT) is recorded on chat spans during streaming. + + TTFT is triggered by events with a `delta` attribute, which includes: + - ResponseTextDeltaEvent (text output) + - ResponseAudioDeltaEvent (audio output) + - ResponseReasoningTextDeltaEvent (reasoning/thinking) + - ResponseFunctionCallArgumentsDeltaEvent (function call args) + - and other delta events... + + Events WITHOUT delta (like ResponseCompletedEvent, ResponseCreatedEvent, etc.) + should NOT trigger TTFT. + """ + import time + + sentry_init( + integrations=[OpenAIAgentsIntegration()], + traces_sample_rate=1.0, + ) + + # Create a mock model that returns a stream_response generator + class MockModel: + model = "gpt-4" + + async def stream_response(self, *args, **kwargs): + # First event: ResponseCreatedEvent (no delta - should NOT trigger TTFT) + created_event = MagicMock(spec=["type", "sequence_number"]) + created_event.type = "response.created" + yield created_event + + # Simulate server-side processing delay before first token + await asyncio.sleep(0.05) # 50ms delay + + # Second event: ResponseTextDeltaEvent (HAS delta - triggers TTFT) + # This simulates the first actual content token + text_delta_event = MagicMock(spec=["delta", "type", "content_index"]) + text_delta_event.delta = "Hello" + text_delta_event.type = "response.output_text.delta" + yield text_delta_event + await asyncio.sleep(0.05) # 50ms delay + + # Third event: more text content (also has delta, but TTFT already recorded) + text_delta_event2 = MagicMock(spec=["delta", "type", "content_index"]) + text_delta_event2.delta = " world!" + text_delta_event2.type = "response.output_text.delta" + yield text_delta_event2 + + # Final event: ResponseCompletedEvent (has response, no delta) + completed_event = MagicMock(spec=["response", "type", "sequence_number"]) + completed_event.response = MagicMock() + completed_event.response.model = "gpt-4" + completed_event.response.usage = Usage( + requests=1, + input_tokens=10, + output_tokens=5, + total_tokens=15, + ) + completed_event.response.output = [] + yield completed_event + + mock_model = MockModel() + + with sentry_sdk.start_transaction(name="test_ttft", sampled=True) as transaction: + # Simulate calling the wrapped stream_response logic + from sentry_sdk.integrations.openai_agents.spans import ai_client_span + + with ai_client_span(test_agent, {}) as span: + span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True) + + ttft_recorded = False + start_time = getattr(test_agent, "_sentry_chat_ttft_start_time", None) + + async for event in mock_model.stream_response(): + # This is the same logic used in the actual integration + if ( + not ttft_recorded + and hasattr(event, "delta") + and start_time is not None + ): + ttft = time.perf_counter() - start_time + span.set_data(SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft) + ttft_recorded = True + + # Verify TTFT is recorded on the chat span (inside transaction context) + chat_spans = [ + s for s in transaction._span_recorder.spans if s.op == "gen_ai.chat" + ] + assert len(chat_spans) >= 1 + chat_span = chat_spans[0] + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in chat_span._data + ttft_value = chat_span._data[SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + # TTFT should be at least 40ms (our simulated delay minus some variance) but reasonable + assert 0.04 < ttft_value < 1.0, f"TTFT {ttft_value} should be around 50ms" From f3f3f24cd47ce32aeb0707a0bcf66008f618281b Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Tue, 27 Jan 2026 10:50:25 +0100 Subject: [PATCH 3/5] fix: using local variable to track start time --- .../integrations/openai_agents/patches/models.py | 12 +++++------- .../integrations/openai_agents/spans/ai_client.py | 5 ----- .../integrations/openai_agents/test_openai_agents.py | 9 +++------ 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/sentry_sdk/integrations/openai_agents/patches/models.py b/sentry_sdk/integrations/openai_agents/patches/models.py index 063155858e..9b57a55f1f 100644 --- a/sentry_sdk/integrations/openai_agents/patches/models.py +++ b/sentry_sdk/integrations/openai_agents/patches/models.py @@ -150,18 +150,16 @@ async def wrapped_stream_response(*args: "Any", **kwargs: "Any") -> "Any": streaming_response = None ttft_recorded = False + # Capture start time locally to avoid race conditions with concurrent requests + start_time = time.perf_counter() async for event in original_stream_response(*args, **kwargs): # Detect first content token (text delta event) if not ttft_recorded and hasattr(event, "delta"): - start_time = getattr( - agent, "_sentry_chat_ttft_start_time", None + ttft = time.perf_counter() - start_time + span.set_data( + SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft ) - if start_time is not None: - ttft = time.perf_counter() - start_time - span.set_data( - SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft - ) ttft_recorded = True # Capture the full response from ResponseCompletedEvent diff --git a/sentry_sdk/integrations/openai_agents/spans/ai_client.py b/sentry_sdk/integrations/openai_agents/spans/ai_client.py index 364728bf23..c099f133f4 100644 --- a/sentry_sdk/integrations/openai_agents/spans/ai_client.py +++ b/sentry_sdk/integrations/openai_agents/spans/ai_client.py @@ -1,5 +1,3 @@ -import time - import sentry_sdk from sentry_sdk.consts import OP, SPANDATA @@ -38,9 +36,6 @@ def ai_client_span( # TODO-anton: remove hardcoded stuff and replace something that also works for embedding and so on span.set_data(SPANDATA.GEN_AI_OPERATION_NAME, "chat") - # Store start time for TTFT calculation on the agent object - agent._sentry_chat_ttft_start_time = time.perf_counter() - _set_agent_data(span, agent) _set_input_data(span, get_response_kwargs) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index d62fd0dbd3..795637986b 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -2276,15 +2276,12 @@ async def stream_response(self, *args, **kwargs): span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True) ttft_recorded = False - start_time = getattr(test_agent, "_sentry_chat_ttft_start_time", None) + # Capture start time locally (same as production code after race condition fix) + start_time = time.perf_counter() async for event in mock_model.stream_response(): # This is the same logic used in the actual integration - if ( - not ttft_recorded - and hasattr(event, "delta") - and start_time is not None - ): + if not ttft_recorded and hasattr(event, "delta"): ttft = time.perf_counter() - start_time span.set_data(SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft) ttft_recorded = True From 092635cba4aaf643f40ef94f9c4c86ce8f387596 Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Tue, 27 Jan 2026 11:24:32 +0100 Subject: [PATCH 4/5] test: fix test case to be actually useful --- .../openai_agents/test_openai_agents.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tests/integrations/openai_agents/test_openai_agents.py b/tests/integrations/openai_agents/test_openai_agents.py index 795637986b..81f4b54e1d 100644 --- a/tests/integrations/openai_agents/test_openai_agents.py +++ b/tests/integrations/openai_agents/test_openai_agents.py @@ -2219,17 +2219,23 @@ async def test_streaming_ttft_on_chat_span(sentry_init, test_agent): Events WITHOUT delta (like ResponseCompletedEvent, ResponseCreatedEvent, etc.) should NOT trigger TTFT. """ - import time + from sentry_sdk.integrations.openai_agents.patches.models import ( + _create_get_model_wrapper, + ) sentry_init( integrations=[OpenAIAgentsIntegration()], traces_sample_rate=1.0, ) - # Create a mock model that returns a stream_response generator + # Create a mock model with stream_response and get_response class MockModel: model = "gpt-4" + async def get_response(self, *args, **kwargs): + # Not used in this test, but required by the wrapper + pass + async def stream_response(self, *args, **kwargs): # First event: ResponseCreatedEvent (no delta - should NOT trigger TTFT) created_event = MagicMock(spec=["type", "sequence_number"]) @@ -2240,12 +2246,10 @@ async def stream_response(self, *args, **kwargs): await asyncio.sleep(0.05) # 50ms delay # Second event: ResponseTextDeltaEvent (HAS delta - triggers TTFT) - # This simulates the first actual content token text_delta_event = MagicMock(spec=["delta", "type", "content_index"]) text_delta_event.delta = "Hello" text_delta_event.type = "response.output_text.delta" yield text_delta_event - await asyncio.sleep(0.05) # 50ms delay # Third event: more text content (also has delta, but TTFT already recorded) text_delta_event2 = MagicMock(spec=["delta", "type", "content_index"]) @@ -2266,33 +2270,32 @@ async def stream_response(self, *args, **kwargs): completed_event.response.output = [] yield completed_event - mock_model = MockModel() - - with sentry_sdk.start_transaction(name="test_ttft", sampled=True) as transaction: - # Simulate calling the wrapped stream_response logic - from sentry_sdk.integrations.openai_agents.spans import ai_client_span + # Create a mock original _get_model that returns our mock model + def mock_get_model(agent, run_config): + return MockModel() - with ai_client_span(test_agent, {}) as span: - span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True) + # Wrap it with our integration wrapper + wrapped_get_model = _create_get_model_wrapper(mock_get_model) - ttft_recorded = False - # Capture start time locally (same as production code after race condition fix) - start_time = time.perf_counter() + with sentry_sdk.start_transaction(name="test_ttft", sampled=True) as transaction: + # Get the wrapped model (this applies the stream_response wrapper) + wrapped_model = wrapped_get_model(None, test_agent, MagicMock()) - async for event in mock_model.stream_response(): - # This is the same logic used in the actual integration - if not ttft_recorded and hasattr(event, "delta"): - ttft = time.perf_counter() - start_time - span.set_data(SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft) - ttft_recorded = True + # Call the wrapped stream_response and consume all events + async for _event in wrapped_model.stream_response(): + pass - # Verify TTFT is recorded on the chat span (inside transaction context) + # Verify TTFT is recorded on the chat span (must be inside transaction context) chat_spans = [ s for s in transaction._span_recorder.spans if s.op == "gen_ai.chat" ] assert len(chat_spans) >= 1 chat_span = chat_spans[0] + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in chat_span._data ttft_value = chat_span._data[SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] # TTFT should be at least 40ms (our simulated delay minus some variance) but reasonable assert 0.04 < ttft_value < 1.0, f"TTFT {ttft_value} should be around 50ms" + + # Verify streaming flag is set + assert chat_span._data.get(SPANDATA.GEN_AI_RESPONSE_STREAMING) is True From 8a6c6f134209141f301ab9843e068a67f07fea3a Mon Sep 17 00:00:00 2001 From: Fabian Schindler Date: Tue, 27 Jan 2026 12:45:09 +0100 Subject: [PATCH 5/5] test: add TTFT tests for openai --- tests/integrations/openai/test_openai.py | 200 +++++++++++++++++++++++ 1 file changed, 200 insertions(+) diff --git a/tests/integrations/openai/test_openai.py b/tests/integrations/openai/test_openai.py index 814289c887..505b2bb561 100644 --- a/tests/integrations/openai/test_openai.py +++ b/tests/integrations/openai/test_openai.py @@ -1559,3 +1559,203 @@ def test_openai_message_truncation(sentry_init, capture_events): if SPANDATA.GEN_AI_REQUEST_MESSAGES in span_meta: messages_meta = span_meta[SPANDATA.GEN_AI_REQUEST_MESSAGES] assert "len" in messages_meta.get("", {}) + + +# noinspection PyTypeChecker +def test_streaming_chat_completion_ttft(sentry_init, capture_events): + """ + Test that streaming chat completions capture time-to-first-token (TTFT). + """ + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + client = OpenAI(api_key="z") + returned_stream = Stream(cast_to=None, response=None, client=client) + returned_stream._iterator = [ + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, delta=ChoiceDelta(content="Hello"), finish_reason=None + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + ), + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, delta=ChoiceDelta(content=" world"), finish_reason="stop" + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + ), + ] + + client.chat.completions._post = mock.Mock(return_value=returned_stream) + + with start_transaction(name="openai tx"): + response_stream = client.chat.completions.create( + model="some-model", messages=[{"role": "user", "content": "Say hello"}] + ) + # Consume the stream + for _ in response_stream: + pass + + (tx,) = events + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] + ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + assert isinstance(ttft, float) + assert ttft > 0 + + +# noinspection PyTypeChecker +@pytest.mark.asyncio +async def test_streaming_chat_completion_ttft_async(sentry_init, capture_events): + """ + Test that async streaming chat completions capture time-to-first-token (TTFT). + """ + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + client = AsyncOpenAI(api_key="z") + returned_stream = AsyncStream(cast_to=None, response=None, client=client) + returned_stream._iterator = async_iterator( + [ + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, delta=ChoiceDelta(content="Hello"), finish_reason=None + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + ), + ChatCompletionChunk( + id="1", + choices=[ + DeltaChoice( + index=0, + delta=ChoiceDelta(content=" world"), + finish_reason="stop", + ) + ], + created=100000, + model="model-id", + object="chat.completion.chunk", + ), + ] + ) + + client.chat.completions._post = AsyncMock(return_value=returned_stream) + + with start_transaction(name="openai tx"): + response_stream = await client.chat.completions.create( + model="some-model", messages=[{"role": "user", "content": "Say hello"}] + ) + # Consume the stream + async for _ in response_stream: + pass + + (tx,) = events + span = tx["spans"][0] + assert span["op"] == "gen_ai.chat" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] + ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + assert isinstance(ttft, float) + assert ttft > 0 + + +# noinspection PyTypeChecker +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +def test_streaming_responses_api_ttft(sentry_init, capture_events): + """ + Test that streaming responses API captures time-to-first-token (TTFT). + """ + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + client = OpenAI(api_key="z") + returned_stream = Stream(cast_to=None, response=None, client=client) + returned_stream._iterator = EXAMPLE_RESPONSES_STREAM + client.responses._post = mock.Mock(return_value=returned_stream) + + with start_transaction(name="openai tx"): + response_stream = client.responses.create( + model="some-model", + input="hello", + stream=True, + ) + # Consume the stream + for _ in response_stream: + pass + + (tx,) = events + span = tx["spans"][0] + assert span["op"] == "gen_ai.responses" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] + ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + assert isinstance(ttft, float) + assert ttft > 0 + + +# noinspection PyTypeChecker +@pytest.mark.asyncio +@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available") +async def test_streaming_responses_api_ttft_async(sentry_init, capture_events): + """ + Test that async streaming responses API captures time-to-first-token (TTFT). + """ + sentry_init( + integrations=[OpenAIIntegration()], + traces_sample_rate=1.0, + ) + events = capture_events() + + client = AsyncOpenAI(api_key="z") + returned_stream = AsyncStream(cast_to=None, response=None, client=client) + returned_stream._iterator = async_iterator(EXAMPLE_RESPONSES_STREAM) + client.responses._post = AsyncMock(return_value=returned_stream) + + with start_transaction(name="openai tx"): + response_stream = await client.responses.create( + model="some-model", + input="hello", + stream=True, + ) + # Consume the stream + async for _ in response_stream: + pass + + (tx,) = events + span = tx["spans"][0] + assert span["op"] == "gen_ai.responses" + + # Verify TTFT is captured + assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"] + ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN] + assert isinstance(ttft, float) + assert ttft > 0