Skip to content

Commit 0c81d3a

Browse files
feat(integrations): OpenAI/OpenAI Agents detect and report the time to first token metric (TTFT) as gen_ai.response.time_to_first_token (#5348)
This records the time to first token (TTFT) metric for generation spans (ai_client). For OpenAI Agents it works in this way: we measure the time from when we start the request until we receive the first event with a `delta` attribute (which signifies actual content). #### Issues * Closes https://linear.app/getsentry/issue/TET-1757/time-to-first-tokenchunk-in-openaiopenai-agents-sdk
1 parent 595e84c commit 0c81d3a

File tree

5 files changed

+341
-3
lines changed

5 files changed

+341
-3
lines changed

sentry_sdk/consts.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,12 @@ class SPANDATA:
518518
Example: ["The weather in Paris is rainy and overcast, with temperatures around 57°F", "The weather in London is sunny and warm, with temperatures around 65°F"]
519519
"""
520520

521+
GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN = "gen_ai.response.time_to_first_token"
522+
"""
523+
The time it took to receive the first token from the model.
524+
Example: 0.1
525+
"""
526+
521527
GEN_AI_RESPONSE_TOOL_CALLS = "gen_ai.response.tool_calls"
522528
"""
523529
The tool calls in the model's response.

sentry_sdk/integrations/openai.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import sys
22
from functools import wraps
3+
import time
34

45
import sentry_sdk
56
from sentry_sdk import consts
@@ -249,6 +250,7 @@ def _set_output_data(
249250
response: "Any",
250251
kwargs: "dict[str, Any]",
251252
integration: "OpenAIIntegration",
253+
start_time: "Optional[float]" = None,
252254
finish_span: bool = True,
253255
) -> None:
254256
if hasattr(response, "model"):
@@ -263,6 +265,8 @@ def _set_output_data(
263265
if messages is not None and isinstance(messages, str):
264266
messages = [messages]
265267

268+
ttft: "Optional[float]" = None
269+
266270
if hasattr(response, "choices"):
267271
if should_send_default_pii() and integration.include_prompts:
268272
response_text = [
@@ -320,6 +324,7 @@ def _set_output_data(
320324
old_iterator = response._iterator
321325

322326
def new_iterator() -> "Iterator[ChatCompletionChunk]":
327+
nonlocal ttft
323328
count_tokens_manually = True
324329
for x in old_iterator:
325330
with capture_internal_exceptions():
@@ -330,6 +335,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
330335
if hasattr(choice, "delta") and hasattr(
331336
choice.delta, "content"
332337
):
338+
if start_time is not None and ttft is None:
339+
ttft = time.perf_counter() - start_time
333340
content = choice.delta.content
334341
if len(data_buf) <= choice_index:
335342
data_buf.append([])
@@ -338,6 +345,8 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
338345

339346
# OpenAI responses API
340347
elif hasattr(x, "delta"):
348+
if start_time is not None and ttft is None:
349+
ttft = time.perf_counter() - start_time
341350
if len(data_buf) == 0:
342351
data_buf.append([])
343352
data_buf[0].append(x.delta or "")
@@ -356,6 +365,10 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
356365
yield x
357366

358367
with capture_internal_exceptions():
368+
if ttft is not None:
369+
set_data_normalized(
370+
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
371+
)
359372
if len(data_buf) > 0:
360373
all_responses = ["".join(chunk) for chunk in data_buf]
361374
if should_send_default_pii() and integration.include_prompts:
@@ -375,6 +388,7 @@ def new_iterator() -> "Iterator[ChatCompletionChunk]":
375388
span.__exit__(None, None, None)
376389

377390
async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
391+
nonlocal ttft
378392
count_tokens_manually = True
379393
async for x in old_iterator:
380394
with capture_internal_exceptions():
@@ -385,6 +399,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
385399
if hasattr(choice, "delta") and hasattr(
386400
choice.delta, "content"
387401
):
402+
if start_time is not None and ttft is None:
403+
ttft = time.perf_counter() - start_time
388404
content = choice.delta.content
389405
if len(data_buf) <= choice_index:
390406
data_buf.append([])
@@ -393,6 +409,8 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
393409

394410
# OpenAI responses API
395411
elif hasattr(x, "delta"):
412+
if start_time is not None and ttft is None:
413+
ttft = time.perf_counter() - start_time
396414
if len(data_buf) == 0:
397415
data_buf.append([])
398416
data_buf[0].append(x.delta or "")
@@ -411,6 +429,10 @@ async def new_iterator_async() -> "AsyncIterator[ChatCompletionChunk]":
411429
yield x
412430

413431
with capture_internal_exceptions():
432+
if ttft is not None:
433+
set_data_normalized(
434+
span, SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
435+
)
414436
if len(data_buf) > 0:
415437
all_responses = ["".join(chunk) for chunk in data_buf]
416438
if should_send_default_pii() and integration.include_prompts:
@@ -465,9 +487,10 @@ def _new_chat_completion_common(f: "Any", *args: "Any", **kwargs: "Any") -> "Any
465487

466488
_set_input_data(span, kwargs, operation, integration)
467489

490+
start_time = time.perf_counter()
468491
response = yield f, args, kwargs
469492

470-
_set_output_data(span, response, kwargs, integration, finish_span=True)
493+
_set_output_data(span, response, kwargs, integration, start_time, finish_span=True)
471494

472495
return response
473496

@@ -645,9 +668,10 @@ def _new_responses_create_common(f: "Any", *args: "Any", **kwargs: "Any") -> "An
645668

646669
_set_input_data(span, kwargs, operation, integration)
647670

671+
start_time = time.perf_counter()
648672
response = yield f, args, kwargs
649673

650-
_set_output_data(span, response, kwargs, integration, finish_span=True)
674+
_set_output_data(span, response, kwargs, integration, start_time, finish_span=True)
651675

652676
return response
653677

sentry_sdk/integrations/openai_agents/patches/models.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import copy
2-
import sys
2+
import time
33
from functools import wraps
44

55
from sentry_sdk.integrations import DidNotEnable
@@ -149,8 +149,19 @@ async def wrapped_stream_response(*args: "Any", **kwargs: "Any") -> "Any":
149149
span.set_data(SPANDATA.GEN_AI_RESPONSE_STREAMING, True)
150150

151151
streaming_response = None
152+
ttft_recorded = False
153+
# Capture start time locally to avoid race conditions with concurrent requests
154+
start_time = time.perf_counter()
152155

153156
async for event in original_stream_response(*args, **kwargs):
157+
# Detect first content token (text delta event)
158+
if not ttft_recorded and hasattr(event, "delta"):
159+
ttft = time.perf_counter() - start_time
160+
span.set_data(
161+
SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN, ttft
162+
)
163+
ttft_recorded = True
164+
154165
# Capture the full response from ResponseCompletedEvent
155166
if hasattr(event, "response"):
156167
streaming_response = event.response

tests/integrations/openai/test_openai.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,3 +1559,203 @@ def test_openai_message_truncation(sentry_init, capture_events):
15591559
if SPANDATA.GEN_AI_REQUEST_MESSAGES in span_meta:
15601560
messages_meta = span_meta[SPANDATA.GEN_AI_REQUEST_MESSAGES]
15611561
assert "len" in messages_meta.get("", {})
1562+
1563+
1564+
# noinspection PyTypeChecker
1565+
def test_streaming_chat_completion_ttft(sentry_init, capture_events):
1566+
"""
1567+
Test that streaming chat completions capture time-to-first-token (TTFT).
1568+
"""
1569+
sentry_init(
1570+
integrations=[OpenAIIntegration()],
1571+
traces_sample_rate=1.0,
1572+
)
1573+
events = capture_events()
1574+
1575+
client = OpenAI(api_key="z")
1576+
returned_stream = Stream(cast_to=None, response=None, client=client)
1577+
returned_stream._iterator = [
1578+
ChatCompletionChunk(
1579+
id="1",
1580+
choices=[
1581+
DeltaChoice(
1582+
index=0, delta=ChoiceDelta(content="Hello"), finish_reason=None
1583+
)
1584+
],
1585+
created=100000,
1586+
model="model-id",
1587+
object="chat.completion.chunk",
1588+
),
1589+
ChatCompletionChunk(
1590+
id="1",
1591+
choices=[
1592+
DeltaChoice(
1593+
index=0, delta=ChoiceDelta(content=" world"), finish_reason="stop"
1594+
)
1595+
],
1596+
created=100000,
1597+
model="model-id",
1598+
object="chat.completion.chunk",
1599+
),
1600+
]
1601+
1602+
client.chat.completions._post = mock.Mock(return_value=returned_stream)
1603+
1604+
with start_transaction(name="openai tx"):
1605+
response_stream = client.chat.completions.create(
1606+
model="some-model", messages=[{"role": "user", "content": "Say hello"}]
1607+
)
1608+
# Consume the stream
1609+
for _ in response_stream:
1610+
pass
1611+
1612+
(tx,) = events
1613+
span = tx["spans"][0]
1614+
assert span["op"] == "gen_ai.chat"
1615+
1616+
# Verify TTFT is captured
1617+
assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"]
1618+
ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
1619+
assert isinstance(ttft, float)
1620+
assert ttft > 0
1621+
1622+
1623+
# noinspection PyTypeChecker
1624+
@pytest.mark.asyncio
1625+
async def test_streaming_chat_completion_ttft_async(sentry_init, capture_events):
1626+
"""
1627+
Test that async streaming chat completions capture time-to-first-token (TTFT).
1628+
"""
1629+
sentry_init(
1630+
integrations=[OpenAIIntegration()],
1631+
traces_sample_rate=1.0,
1632+
)
1633+
events = capture_events()
1634+
1635+
client = AsyncOpenAI(api_key="z")
1636+
returned_stream = AsyncStream(cast_to=None, response=None, client=client)
1637+
returned_stream._iterator = async_iterator(
1638+
[
1639+
ChatCompletionChunk(
1640+
id="1",
1641+
choices=[
1642+
DeltaChoice(
1643+
index=0, delta=ChoiceDelta(content="Hello"), finish_reason=None
1644+
)
1645+
],
1646+
created=100000,
1647+
model="model-id",
1648+
object="chat.completion.chunk",
1649+
),
1650+
ChatCompletionChunk(
1651+
id="1",
1652+
choices=[
1653+
DeltaChoice(
1654+
index=0,
1655+
delta=ChoiceDelta(content=" world"),
1656+
finish_reason="stop",
1657+
)
1658+
],
1659+
created=100000,
1660+
model="model-id",
1661+
object="chat.completion.chunk",
1662+
),
1663+
]
1664+
)
1665+
1666+
client.chat.completions._post = AsyncMock(return_value=returned_stream)
1667+
1668+
with start_transaction(name="openai tx"):
1669+
response_stream = await client.chat.completions.create(
1670+
model="some-model", messages=[{"role": "user", "content": "Say hello"}]
1671+
)
1672+
# Consume the stream
1673+
async for _ in response_stream:
1674+
pass
1675+
1676+
(tx,) = events
1677+
span = tx["spans"][0]
1678+
assert span["op"] == "gen_ai.chat"
1679+
1680+
# Verify TTFT is captured
1681+
assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"]
1682+
ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
1683+
assert isinstance(ttft, float)
1684+
assert ttft > 0
1685+
1686+
1687+
# noinspection PyTypeChecker
1688+
@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available")
1689+
def test_streaming_responses_api_ttft(sentry_init, capture_events):
1690+
"""
1691+
Test that streaming responses API captures time-to-first-token (TTFT).
1692+
"""
1693+
sentry_init(
1694+
integrations=[OpenAIIntegration()],
1695+
traces_sample_rate=1.0,
1696+
)
1697+
events = capture_events()
1698+
1699+
client = OpenAI(api_key="z")
1700+
returned_stream = Stream(cast_to=None, response=None, client=client)
1701+
returned_stream._iterator = EXAMPLE_RESPONSES_STREAM
1702+
client.responses._post = mock.Mock(return_value=returned_stream)
1703+
1704+
with start_transaction(name="openai tx"):
1705+
response_stream = client.responses.create(
1706+
model="some-model",
1707+
input="hello",
1708+
stream=True,
1709+
)
1710+
# Consume the stream
1711+
for _ in response_stream:
1712+
pass
1713+
1714+
(tx,) = events
1715+
span = tx["spans"][0]
1716+
assert span["op"] == "gen_ai.responses"
1717+
1718+
# Verify TTFT is captured
1719+
assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"]
1720+
ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
1721+
assert isinstance(ttft, float)
1722+
assert ttft > 0
1723+
1724+
1725+
# noinspection PyTypeChecker
1726+
@pytest.mark.asyncio
1727+
@pytest.mark.skipif(SKIP_RESPONSES_TESTS, reason="Responses API not available")
1728+
async def test_streaming_responses_api_ttft_async(sentry_init, capture_events):
1729+
"""
1730+
Test that async streaming responses API captures time-to-first-token (TTFT).
1731+
"""
1732+
sentry_init(
1733+
integrations=[OpenAIIntegration()],
1734+
traces_sample_rate=1.0,
1735+
)
1736+
events = capture_events()
1737+
1738+
client = AsyncOpenAI(api_key="z")
1739+
returned_stream = AsyncStream(cast_to=None, response=None, client=client)
1740+
returned_stream._iterator = async_iterator(EXAMPLE_RESPONSES_STREAM)
1741+
client.responses._post = AsyncMock(return_value=returned_stream)
1742+
1743+
with start_transaction(name="openai tx"):
1744+
response_stream = await client.responses.create(
1745+
model="some-model",
1746+
input="hello",
1747+
stream=True,
1748+
)
1749+
# Consume the stream
1750+
async for _ in response_stream:
1751+
pass
1752+
1753+
(tx,) = events
1754+
span = tx["spans"][0]
1755+
assert span["op"] == "gen_ai.responses"
1756+
1757+
# Verify TTFT is captured
1758+
assert SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN in span["data"]
1759+
ttft = span["data"][SPANDATA.GEN_AI_RESPONSE_TIME_TO_FIRST_TOKEN]
1760+
assert isinstance(ttft, float)
1761+
assert ttft > 0

0 commit comments

Comments
 (0)