sentry_sdk/consts.py: 6 additions, 0 deletions
@@ -518,6 +518,12 @@ class SPANDATA:
Example: ["The weather in Paris is rainy and overcast, with temperatures around 57°F", "The weather in London is sunny and warm, with temperatures around 65°F"]
"""

GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN = "gen_ai.response.time_to_first_token"
"""
The time it took to receive the first token from the model, in seconds.
Example: 0.1
"""

GEN_AI_RESPONSE_TOOL_CALLS = "gen_ai.response.tool_calls"
"""
The tool calls in the model's response.
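
The new constant is a plain string key on SPANDATA; the integration attaches it to spans as ordinary span data, in seconds. A minimal sketch of the resulting shape, using only public SDK calls (the DSN is the standard docs placeholder, and real streaming calls set this attribute automatically):

import sentry_sdk
from sentry_sdk.consts import SPANDATA

sentry_sdk.init(dsn="https://examplePublicKey@o0.ingest.sentry.io/0", traces_sample_rate=1.0)

with sentry_sdk.start_transaction(name="demo"):
    # Illustration only: shows where the attribute lands on a span.
    with sentry_sdk.start_span(op="gen_ai.chat", description="chat") as span:
        span.set_data(SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, 0.1)
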
sentry_sdk/integrations/openai.py: 26 additions, 2 deletions
@@ -1,5 +1,6 @@
import sys
from functools import wraps
import time

import sentry_sdk
from sentry_sdk import consts
@@ -249,6 +250,7 @@ def _set_output_data(
response: "Any",
kwargs: "dict[str, Any]",
integration: "OpenAIIntegration",
start_time: "Optional[float]" = None,
finish_span: bool = True,
) -> None:
if hasattr(response, "model"):
@@ -263,6 +265,8 @@
if messages is not None and isinstance(messages, str):
messages = [messages]

ttft: "Optional[float]" = None

if hasattr(response, "choices"):
if should_send_default_pii() and integration.include_prompts:
response_text = [
@@ -320,6 +324,7 @@
old_iterator = response._iterator

def new_iterator() -> "Iterator[ChatCompletionChunk]":
nonlocal ttft
count_tokens_manually = True
for x in old_iterator:
with capture_internal_exceptions():
@@ -330,6 +335,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
if hasattr(choice, "delta") and hasattr(
choice.delta, "content"
):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
content = choice.delta.content
if len(data_buf) <= choice_index:
data_buf.append([])
@@ -338,6 +345,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":

# OpenAI responses API
elif hasattr(x, "delta"):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
if len(data_buf) == 0:
data_buf.append([])
data_buf[0].append(x.delta or "")
@@ -356,6 +365,10 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
yield x

with capture_internal_exceptions():
if ttft is not None:
set_data_normalized(
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
if len(data_buf) > 0:
all_responses = ["".join(chunk) for chunk in data_buf]
if should_send_default_pii() and integration.include_prompts:
@@ -375,6 +388,7 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
span.__exit__(None, None, None)

async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
nonlocal ttft
count_tokens_manually = True
async for x in old_iterator:
with capture_internal_exceptions():
@@ -385,6 +399,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
if hasattr(choice, "delta") and hasattr(
choice.delta, "content"
):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
content = choice.delta.content
if len(data_buf) <= choice_index:
data_buf.append([])
@@ -393,6 +409,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":

# OpenAI responses API
elif hasattr(x, "delta"):
if start_time is not None and ttft is None:
ttft = time.perf_counter() - start_time
if len(data_buf) == 0:
data_buf.append([])
data_buf[0].append(x.delta or "")
@@ -411,6 +429,10 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
yield x

with capture_internal_exceptions():
if ttft is not None:
set_data_normalized(
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
if len(data_buf) > 0:
all_responses = ["".join(chunk) for chunk in data_buf]
if should_send_default_pii() and integration.include_prompts:
@@ -465,9 +487,10 @@ def _new_chat_completion_common(f: "Any", *args: "Any", **kwargs: "Any") -> "Any

_set_input_data(span, kwargs, operation, integration)

start_time = time.perf_counter()
response = yield f, args, kwargs

-_set_output_data(span, response, kwargs, integration, finish_span=True)
+_set_output_data(span, response, kwargs, integration, start_time, finish_span=True)

return response

@@ -645,9 +668,10 @@ def _new_responses_create_common(f: "Any", *args: "Any", **kwargs: "Any") -> "An

_set_input_data(span, kwargs, operation, integration)

start_time = time.perf_counter()
response = yield f, args, kwargs

-_set_output_data(span, response, kwargs, integration, finish_span=True)
+_set_output_data(span, response, kwargs, integration, start_time, finish_span=True)

return response

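
Stripped of the SDK plumbing, the measurement pattern added above is: stamp time.perf_counter() before the request is issued, then compute the delta exactly once, when the first content delta arrives, while passing every chunk through to the caller. A standalone sketch of that pattern (with_ttft is an illustrative name, not SDK API):

import time
from typing import Iterator, Optional

def with_ttft(chunks: "Iterator[str]", start_time: float) -> "Iterator[str]":
    # Mirrors new_iterator/new_iterator_async above: ttft is computed only
    # once, on the first chunk; all chunks are yielded unchanged.
    ttft: "Optional[float]" = None
    for chunk in chunks:
        if ttft is None:
            ttft = time.perf_counter() - start_time
            print(f"time to first token: {ttft:.3f}s")
        yield chunk

start = time.perf_counter()
for _ in with_ttft(iter(["Hello", " world"]), start):
    pass
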
sentry_sdk/integrations/openai_agents/patches/models.py: 12 additions, 1 deletion
@@ -1,5 +1,5 @@
import copy
import sys
import time
from functools import wraps

from sentry_sdk.integrations import DidNotEnable
@@ -149,8 +149,19 @@ async def wrapped_stream_response(*args: "Any", **kwargs: "Any") -> "Any":
span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True)

streaming_response = None
ttft_recorded = False
# Capture start time locally to avoid race conditions with concurrent requests
start_time = time.perf_counter()

async for event in original_stream_response(*args, **kwargs):
# Detect first content token (text delta event)
if not ttft_recorded and hasattr(event, "delta"):
ttft = time.perf_counter() - start_time
span.set_data(
SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
)
ttft_recorded = True

# Capture the full response from ResponseCompletedEvent
if hasattr(event, "response"):
streaming_response = event.response
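
End to end, streaming spans gain the attribute with no user-side changes beyond enabling the integration. A hypothetical usage sketch (the DSN is the docs placeholder; model name and prompt are arbitrary; assumes a valid OpenAI API key in the environment):

import sentry_sdk
from openai import OpenAI
from sentry_sdk.integrations.openai import OpenAIIntegration

sentry_sdk.init(
    dsn="https://examplePublicKey@o0.ingest.sentry.io/0",
    traces_sample_rate=1.0,
    integrations=[OpenAIIntegration()],
)

client = OpenAI()
with sentry_sdk.start_transaction(name="openai demo"):
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Say hello"}],
        stream=True,
    )
    for _ in stream:
        # gen_ai.response.time_to_first_token is recorded when the first
        # delta arrives; the span finishes once the stream is exhausted.
        pass
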
tests/integrations/openai/test_openai.py: 200 additions, 0 deletions
@@ -1559,3 +1559,203 @@ def test_openai_message_truncation(sentry_init, capture_events):
if SPANDATA.GEN_AI_REQUEST_MESSAGES in span_meta:
messages_meta = span_meta[SPANDATA.GEN_AI_REQUEST_MESSAGES]
assert "len" in messages_meta.get("", {})


# noinspection PyTypeChecker
def test_streaming_chat_completion_ttft(sentry_init, capture_events):
"""
Test that streaming chat completions capture time-to-first-token (TTFT).
"""
sentry_init(
integrations=[OpenAIIntegration()],
traces_sample_rate=1.0,
)
events = capture_events()

client = OpenAI(api_key="z")
returned_stream = Stream(cast_to=None, response=None, client=client)
returned_stream._iterator = [
ChatCompletionChunk(
id="1",
choices=[
DeltaChoice(
index=0, delta=ChoiceDelta(content="Hello"), finish_reason=None
)
],
created=100000,
model="model-id",
object="chat.completion.chunk",
),
ChatCompletionChunk(
id="1",
choices=[
DeltaChoice(
index=0, delta=ChoiceDelta(content=" world"), finish_reason="stop"
)
],
created=100000,
model="model-id",
object="chat.completion.chunk",
),
]

client.chat.completions._post = mock.Mock(return_value=returned_stream)

with start_transaction(name="openai tx"):
response_stream = client.chat.completions.create(
model="some-model", messages=[{"role": "user", "content": "Say hello"}]
)
# Consume the stream
for _ in response_stream:
pass

(tx,) = events
span = tx["spans"][0]
assert span["op"] == "gen_ai.chat"

# Verify TTFT is captured
assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"]
ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
assert isinstance(ttft, float)
assert ttft > 0


# noinspection PyTypeChecker
@pytest.mark.asyncio
async def test_streaming_chat_completion_ttft_async(sentry_init, capture_events):
"""
Test that async streaming chat completions capture time-to-first-token (TTFT).
"""
sentry_init(
integrations=[OpenAIIntegration()],
traces_sample_rate=1.0,
)
events = capture_events()

client = AsyncOpenAI(api_key="z")
returned_stream = AsyncStream(cast_to=None, response=None, client=client)
returned_stream._iterator = async_iterator(
[
ChatCompletionChunk(
id="1",
choices=[
DeltaChoice(
index=0, delta=ChoiceDelta(content="Hello"), finish_reason=None
)
],
created=100000,
model="model-id",
object="chat.completion.chunk",
),
ChatCompletionChunk(
id="1",
choices=[
DeltaChoice(
index=0,
delta=ChoiceDelta(content=" world"),
finish_reason="stop",
)
],
created=100000,
model="model-id",
object="chat.completion.chunk",
),
]
)

client.chat.completions._post = AsyncMock(return_value=returned_stream)

with start_transaction(name="openai tx"):
response_stream = await client.chat.completions.create(
model="some-model", messages=[{"role": "user", "content": "Say hello"}]
)
# Consume the stream
async for _ in response_stream:
pass

(tx,) = events
span = tx["spans"][0]
assert span["op"] == "gen_ai.chat"

# Verify TTFT is captured
assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"]
ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
assert isinstance(ttft, float)
assert ttft > 0


# noinspection PyTypeChecker
@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available")
def test_streaming_responses_api_ttft(sentry_init, capture_events):
"""
Test that streaming responses API captures time-to-first-token (TTFT).
"""
sentry_init(
integrations=[OpenAIIntegration()],
traces_sample_rate=1.0,
)
events = capture_events()

client = OpenAI(api_key="z")
returned_stream = Stream(cast_to=None, response=None, client=client)
returned_stream._iterator = EXAMPLE_RESPONSES_STREAM
client.responses._post = mock.Mock(return_value=returned_stream)

with start_transaction(name="openai tx"):
response_stream = client.responses.create(
model="some-model",
input="hello",
stream=True,
)
# Consume the stream
for _ in response_stream:
pass

(tx,) = events
span = tx["spans"][0]
assert span["op"] == "gen_ai.responses"

# Verify TTFT is captured
assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"]
ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
assert isinstance(ttft, float)
assert ttft > 0


# noinspection PyTypeChecker
@pytest.mark.asyncio
@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available")
async def test_streaming_responses_api_ttft_async(sentry_init, capture_events):
"""
Test that async streaming responses API captures time-to-first-token (TTFT).
"""
sentry_init(
integrations=[OpenAIIntegration()],
traces_sample_rate=1.0,
)
events = capture_events()

client = AsyncOpenAI(api_key="z")
returned_stream = AsyncStream(cast_to=None, response=None, client=client)
returned_stream._iterator = async_iterator(EXAMPLE_RESPONSES_STREAM)
client.responses._post = AsyncMock(return_value=returned_stream)

with start_transaction(name="openai tx"):
response_stream = await client.responses.create(
model="some-model",
input="hello",
stream=True,
)
# Consume the stream
async for _ in response_stream:
pass

(tx,) = events
span = tx["spans"][0]
assert span["op"] == "gen_ai.responses"

# Verify TTFT is captured
assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"]
ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
assert isinstance(ttft, float)
assert ttft > 0