
Commit 6a4ede6

Improve metrics log (#4297)
* improve metrics log
* minor
* fix according to review
* more comments
* minor
* minor
1 parent 96c9857 commit 6a4ede6

File tree

lmdeploy/metrics/loggers.py
lmdeploy/metrics/metrics_processor.py
lmdeploy/metrics/stats.py
lmdeploy/serve/core/async_engine.py

4 files changed: +71 -39 lines changed


lmdeploy/metrics/loggers.py

Lines changed: 33 additions & 23 deletions
@@ -73,10 +73,10 @@ def record_specdecode(self, stats: SpeculativeDecodingStats):
     def record_finish(self, stats: RequestStats):
         pass
 
-    def log_spec_msg(self):
+    def get_spec_msg(self):
         """Get spec decoding logging msg."""
         if self.num_drafts == 0:
-            return
+            return None
 
         draft_acceptance_rate = (self.num_accepted_tokens / self.num_draft_tokens *
                                  100 if self.num_draft_tokens > 0 else float('nan'))
@@ -97,7 +97,6 @@ def log_spec_msg(self):
 
     def log(self):
         now = time.perf_counter()
-        spec_msg = self.log_spec_msg()
 
         # skip logging if no tokens were processed
        if self.total_prompt_tokens == 0 and self.total_generation_tokens == 0:
@@ -108,23 +107,27 @@ def log(self):
         prompt_throughput = self.total_prompt_tokens / (now - self.last_log_time)
         generation_throughput = self.total_generation_tokens / (now - self.last_log_time)
         scheduler_stats = self.last_scheduler_stats
-        self._reset(now)
+        scheduler_stats.num_api_waiting_reqs = scheduler_stats.num_total_reqs - \
+            scheduler_stats.num_completed_reqs - scheduler_stats.num_api_routed_reqs
+        spec_msg = self.get_spec_msg()
 
         # format and print
-        log_msg = (f"[{datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')} "
-                   f'DP{self.dp_rank}] '
-                   f'Avg prompt throughput: {prompt_throughput:.1f} tokens/s, '
-                   f'Avg generation throughput: {generation_throughput:.1f} tokens/s, '
-                   f'Finished: {scheduler_stats.num_finished_reqs} reqs, '
-                   f'Unfinished: {scheduler_stats.num_total_reqs-scheduler_stats.num_finished_reqs} reqs, '
-                   f'Running: {scheduler_stats.num_running_reqs} reqs, '
-                   f'Waiting: {scheduler_stats.num_waiting_reqs} reqs, '
-                   f'GPU KV cache usage: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, '
-                   f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%')
+        log_msg = (
+            f"[{datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')} DP{self.dp_rank}] "
+            f'Avg thr (in/out): {prompt_throughput:.1f} / {generation_throughput:.1f} tokens/s, '
+            f'API server (completed/routed/waiting): {scheduler_stats.num_completed_reqs} / '
+            f'{scheduler_stats.num_api_routed_reqs} / {scheduler_stats.num_api_waiting_reqs}, '
+            f'Engine (running/waiting): {scheduler_stats.num_running_reqs} / {scheduler_stats.num_waiting_reqs}, '
+            f'KV cache: {scheduler_stats.gpu_cache_usage * 100 :.1f}%, ')
+
+        if scheduler_stats.prefix_cache_hit_rate != 0:
+            log_msg += f'Prefix cache hit rate: {scheduler_stats.prefix_cache_hit_rate * 100 :.1f}%, '
 
         if spec_msg is not None:
-            log_msg += ', ' + spec_msg
+            log_msg += spec_msg
+
         print(log_msg, flush=True)
+        self._reset(now)
 
 
 class PrometheusStatLogger(StatLoggerBase):
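
As an illustration of the new, more compact format above, a periodic stat line now reads roughly like this (the numbers below are made up for illustration; the prefix-cache and speculative-decoding segments are appended only when non-zero/enabled):

[2025-01-01 12:00:00 DP0] Avg thr (in/out): 512.3 / 48.7 tokens/s, API server (completed/routed/waiting): 120 / 8 / 2, Engine (running/waiting): 6 / 2, KV cache: 37.5%,

Note that the base message ends with ', ' so the optional prefix-cache and spec-decoding segments can be concatenated onto it directly.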
@@ -154,13 +157,18 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
         #
         # Scheduler stats
         #
-        self.gauge_scheduler_finished = prometheus_client.Gauge(name='lmdeploy:num_requests_finished',
-                                                                documentation='Number of current finished requests.',
-                                                                labelnames=labelnames).labels(*labelvalues)
+        self.gauge_scheduler_completed = prometheus_client.Gauge(name='lmdeploy:num_requests_completed',
+                                                                 documentation='Number of current completed requests.',
+                                                                 labelnames=labelnames).labels(*labelvalues)
+
+        self.gauge_scheduler_api_routed = prometheus_client.Gauge(
+            name='lmdeploy:num_api_requests_routed',
+            documentation='Number of requests routed to request handles.',
+            labelnames=labelnames).labels(*labelvalues)
 
-        self.gauge_scheduler_unfinished = prometheus_client.Gauge(
-            name='lmdeploy:num_requests_unfinished',
-            documentation='Number of current unfinished requests.',
+        self.gauge_scheduler_api_waiting = prometheus_client.Gauge(
+            name='lmdeploy:num_api_requests_waiting',
+            documentation='Number of requests waiting for free request handles.',
             labelnames=labelnames).labels(*labelvalues)
 
         self.gauge_scheduler_running = prometheus_client.Gauge(
@@ -300,8 +308,10 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
 
     def record_schedule(self, stats: SchedulerStats) -> None:
         """Report schedule metrics to prometheus."""
-        self.gauge_scheduler_finished.set(stats.num_finished_reqs)
-        self.gauge_scheduler_unfinished.set(stats.num_total_reqs - stats.num_finished_reqs)
+        self.gauge_scheduler_completed.set(stats.num_completed_reqs)
+        self.gauge_scheduler_api_routed.set(stats.num_api_routed_reqs)
+        self.gauge_scheduler_api_waiting.set(stats.num_total_reqs - stats.num_completed_reqs -
+                                             stats.num_api_routed_reqs)
         self.gauge_scheduler_running.set(stats.num_running_reqs)
         self.gauge_scheduler_waiting.set(stats.num_waiting_reqs)
         self.gauge_gpu_cache_usage.set(stats.gpu_cache_usage)
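
To sanity-check the renamed gauges outside the server, here is a minimal standalone sketch against the real prometheus_client API (the label names and sample numbers are assumptions for illustration, not lmdeploy's actual label set):

import prometheus_client

labelnames = ['model_name', 'dp_rank']  # hypothetical labels
labelvalues = ['my-model', '0']         # hypothetical values

gauge_api_waiting = prometheus_client.Gauge(
    name='lmdeploy:num_api_requests_waiting',
    documentation='Number of requests waiting for free request handles.',
    labelnames=labelnames).labels(*labelvalues)

# As in record_schedule above, the waiting count is derived rather than stored:
# waiting = total - completed - routed.
num_total_reqs, num_completed_reqs, num_api_routed_reqs = 130, 120, 8
gauge_api_waiting.set(num_total_reqs - num_completed_reqs - num_api_routed_reqs)

# Dump the exposition format a Prometheus scrape would receive.
print(prometheus_client.generate_latest().decode())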

lmdeploy/metrics/metrics_processor.py

Lines changed: 13 additions & 5 deletions
@@ -93,13 +93,21 @@ def queue_update(self, update_data: tuple):
             return
         self.metrics_queue.put_nowait(update_data)
 
-    def increment_total_requests(self):
-        """Increment total requests."""
+    def increase_total_requests(self):
+        """Increase total requests."""
         self.scheduler_stats.num_total_reqs += 1
 
-    def increment_finished_requests(self):
-        """Increment finished requests."""
-        self.scheduler_stats.num_finished_reqs += 1
+    def increase_completed_requests(self):
+        """Increase completed requests."""
+        self.scheduler_stats.num_completed_reqs += 1
+
+    def increase_api_routed_requests(self):
+        """Increase API routed requests."""
+        self.scheduler_stats.num_api_routed_reqs += 1
+
+    def decrease_api_routed_requests(self):
+        """Decrease API routed requests."""
+        self.scheduler_stats.num_api_routed_reqs -= 1
 
 
 metrics_processor = MetricsProcessor()
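
The renamed helpers pair up over a request's lifetime: increase_total_requests when the API server accepts a request, increase/decrease_api_routed_requests while the request owns an engine handle, and increase_completed_requests when it finishes. A self-contained sketch of that bookkeeping, using a simplified stand-in for the real MetricsProcessor:

from dataclasses import dataclass

@dataclass
class FakeSchedulerStats:  # simplified stand-in, not the real class
    num_total_reqs: int = 0
    num_completed_reqs: int = 0
    num_api_routed_reqs: int = 0

stats = FakeSchedulerStats()
stats.num_total_reqs += 1        # generate(): request accepted by the API server
stats.num_api_routed_reqs += 1   # safe_run(): request handle acquired
# ... inference streams tokens ...
stats.num_api_routed_reqs -= 1   # safe_run() finally block: handle released
stats.num_completed_reqs += 1    # generate(): request completed

# Every accepted request is eventually either completed or still in flight.
waiting = stats.num_total_reqs - stats.num_completed_reqs - stats.num_api_routed_reqs
assert waiting == 0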

lmdeploy/metrics/stats.py

Lines changed: 19 additions & 6 deletions
@@ -13,18 +13,29 @@
 @dataclass
 class SchedulerStats:
     """Stats associated with the scheduler.
+    Desc:
+        Dataflow: client --> API server --> Engine core
+        API server total = completed + uncompleted = completed + (api_routed + api_waiting)
+        Engine core total = running + waiting = api_routed
 
     Attributes:
-        num_total_reqs: The number of all requests received since server start.
-        num_finished_reqs: The number of successfully completed requests since server start.
-        num_running_reqs: Currently executing requests.
-        num_waiting_reqs: Requests queued waiting for execution.
+        num_total_reqs: API server, the number of all requests received since server start.
+        num_completed_reqs: API server, the number of successfully completed requests since server start.
+        num_api_routed_reqs: API server, the number of requests routed to request handles.
+        num_api_waiting_reqs: API server, the number of requests waiting for free request handles.
+        num_running_reqs: Engine core, currently executing requests.
+        num_waiting_reqs: Engine core, requests queued waiting for execution.
         gpu_cache_usage: Fraction of GPU KV blocks utilized (0.0 to 1.0).
         prefix_cache_hit_rate: Prefix caching hit rate.
     """
 
+    # api server
     num_total_reqs: int = 0
-    num_finished_reqs: int = 0
+    num_completed_reqs: int = 0
+    num_api_routed_reqs: int = 0
+    num_api_waiting_reqs: int = 0
+
+    # engine core
     num_running_reqs: int = 0
     num_waiting_reqs: int = 0
     gpu_cache_usage: float = 0.0
@@ -33,7 +44,9 @@ class SchedulerStats:
     def __repr__(self):
         return ('SchedulerStats(\n'
                 f'  num_total_reqs={self.num_total_reqs},\n'
-                f'  num_finished_reqs={self.num_finished_reqs},\n'
+                f'  num_completed_reqs={self.num_completed_reqs},\n'
+                f'  num_api_routed_reqs={self.num_api_routed_reqs},\n'
+                f'  num_api_waiting_reqs={self.num_api_waiting_reqs},\n'
                 f'  num_running_reqs={self.num_running_reqs},\n'
                 f'  num_waiting_reqs={self.num_waiting_reqs},\n'
                 f'  gpu_cache_usage={self.gpu_cache_usage:.6f},\n'
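
To make the docstring's accounting concrete, here is a short worked example (assuming SchedulerStats as defined above; the derived num_api_waiting_reqs is filled in the same way DefaultStatLogger.log does it):

from lmdeploy.metrics.stats import SchedulerStats  # module path as in this diff

stats = SchedulerStats(num_total_reqs=130,
                       num_completed_reqs=120,
                       num_api_routed_reqs=8,
                       num_running_reqs=6,
                       num_waiting_reqs=2)

# API server view: 130 total = 120 completed + (8 routed + 2 waiting).
stats.num_api_waiting_reqs = (stats.num_total_reqs - stats.num_completed_reqs -
                              stats.num_api_routed_reqs)
assert stats.num_api_waiting_reqs == 2

# Engine core view: routed requests are exactly those running or queued there.
assert stats.num_api_routed_reqs == stats.num_running_reqs + stats.num_waiting_reqs
print(stats)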

lmdeploy/serve/core/async_engine.py

Lines changed: 6 additions & 5 deletions
@@ -265,6 +265,7 @@ def _determine_gen_config(self, session, input_ids, gen_config: GenerationConfig
     async def safe_run(self, handle, session, **kwargs):
         generator = handle.async_stream_infer(session.session_id, **kwargs)
         try:
+            metrics_processor.increase_api_routed_requests()
             yield generator
         except (Exception, asyncio.CancelledError, GeneratorExit) as e:  # noqa
             logger.error(f'[safe_run] session {session.session_id} exception caught: {type(e).__name__} {e}')
@@ -274,6 +275,7 @@ async def safe_run(self, handle, session, **kwargs):
             raise SafeRunException(f'Safe run exception for session {session.session_id}') from e
         finally:
             await generator.aclose()
+            metrics_processor.decrease_api_routed_requests()
 
     async def generate(
         self,
@@ -389,13 +391,12 @@ def is_error(status):
         if not gen_config.ignore_eos:
             stop_ids = gen_config.stop_token_ids or []
 
-        metrics_processor.increment_total_requests()
-
+        metrics_processor.increase_total_requests()
         async with session.request_handle() as handle:
             if epoch != self.epoch:
                 logger.debug(f'[generate] session {session_id} got aborted before starting inference')
-                # TODO(lvhan): metrics_processor.increment_failed_requests('abort')
-                metrics_processor.increment_finished_requests()
+                # TODO(lvhan): metrics_processor.increase_failed_requests('abort')
+                metrics_processor.increase_completed_requests()
                 yield GenOut(response='',
                              history_token_len=0,
                              input_token_len=len(input_ids),
@@ -467,7 +468,7 @@ def is_error(status):
                 out.logits = (outputs.logits[:-hit_stop_token] if hit_stop_token else outputs.logits)
                 yield out
             # end of generator loop
-            metrics_processor.increment_finished_requests()
+            metrics_processor.increase_completed_requests()
 
             if not is_error(outputs.status):
                 if outputs.status == ResponseType.CANCEL:
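
The essential property of the safe_run change is that the routed counter stays balanced even when the generator is cancelled or raises, because the decrement lives in the finally block. A simplified, self-contained sketch of that pattern (the names here are stand-ins, not the lmdeploy API):

import asyncio
from contextlib import asynccontextmanager

num_api_routed_reqs = 0  # module-level stand-in for the metrics processor's counter

@asynccontextmanager
async def safe_run_like(session_id: int):
    global num_api_routed_reqs
    try:
        num_api_routed_reqs += 1  # the request now owns an engine handle
        yield session_id
    finally:
        # Runs on normal exit, on exceptions, and on asyncio.CancelledError alike,
        # mirroring the decrease_api_routed_requests() call above.
        num_api_routed_reqs -= 1

async def main():
    async with safe_run_like(0):
        assert num_api_routed_reqs == 1
    assert num_api_routed_reqs == 0  # balanced even if the body had raised

asyncio.run(main())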
