@@ -73,10 +73,10 @@ def record_specdecode(self, stats: SpeculativeDecodingStats):
7373 def record_finish (self , stats : RequestStats ):
7474 pass
7575
76- def log_spec_msg (self ):
76+ def get_spec_msg (self ):
7777 """Get spec decoding logging msg."""
7878 if self .num_drafts == 0 :
79- return
79+ return None
8080
8181 draft_acceptance_rate = (self .num_accepted_tokens / self .num_draft_tokens *
8282 100 if self .num_draft_tokens > 0 else float ('nan' ))
@@ -97,7 +97,6 @@ def log_spec_msg(self):
9797
9898 def log (self ):
9999 now = time .perf_counter ()
100- spec_msg = self .log_spec_msg ()
101100
102101 # skip logging if no tokens were processed
103102 if self .total_prompt_tokens == 0 and self .total_generation_tokens == 0 :
@@ -108,23 +107,27 @@ def log(self):
108107 prompt_throughput = self .total_prompt_tokens / (now - self .last_log_time )
109108 generation_throughput = self .total_generation_tokens / (now - self .last_log_time )
110109 scheduler_stats = self .last_scheduler_stats
111- self ._reset (now )
110+ scheduler_stats .num_api_waiting_reqs = scheduler_stats .num_total_reqs - \
111+ scheduler_stats .num_completed_reqs - scheduler_stats .num_api_routed_reqs
112+ spec_msg = self .get_spec_msg ()
112113
113114 # format and print
114- log_msg = (f"[ { datetime . fromtimestamp ( time . time ()). strftime ( '%Y-%m-%d %H:%M:%S' ) } "
115- f' DP{ self .dp_rank } ] '
116- f'Avg prompt throughput : { prompt_throughput :.1f} tokens/s, '
117- f'Avg generation throughput : { generation_throughput :.1f } tokens/s, '
118- f'Finished: { scheduler_stats .num_finished_reqs } reqs , '
119- f'Unfinished : { scheduler_stats .num_total_reqs - scheduler_stats .num_finished_reqs } reqs , '
120- f'Running : { scheduler_stats .num_running_reqs } reqs , '
121- f'Waiting: { scheduler_stats . num_waiting_reqs } reqs, '
122- f'GPU KV cache usage: { scheduler_stats .gpu_cache_usage * 100 :.1f } %, '
123- f'Prefix cache hit rate: { scheduler_stats .prefix_cache_hit_rate * 100 :.1f} %' )
115+ log_msg = (
116+ f"[ { datetime . fromtimestamp ( time . time ()). strftime ( '%Y-%m-%d %H:%M:%S' ) } DP{ self .dp_rank } ] "
117+ f'Avg thr (in/out) : { prompt_throughput :.1f } / { generation_throughput :.1f} tokens/s, '
118+ f'API server (completed/routed/waiting) : { scheduler_stats . num_completed_reqs } / '
119+ f' { scheduler_stats .num_api_routed_reqs } / { scheduler_stats . num_api_waiting_reqs } , '
120+ f'Engine (running/waiting) : { scheduler_stats .num_running_reqs } / { scheduler_stats .num_waiting_reqs } , '
121+ f'KV cache : { scheduler_stats .gpu_cache_usage * 100 :.1f } % , ')
122+
123+ if scheduler_stats .prefix_cache_hit_rate != 0 :
124+ log_msg += f'Prefix cache hit rate: { scheduler_stats .prefix_cache_hit_rate * 100 :.1f} %, '
124125
125126 if spec_msg is not None :
126- log_msg += ', ' + spec_msg
127+ log_msg += spec_msg
128+
127129 print (log_msg , flush = True )
130+ self ._reset (now )
128131
129132
130133class PrometheusStatLogger (StatLoggerBase ):
@@ -154,13 +157,18 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
154157 #
155158 # Scheduler stats
156159 #
157- self .gauge_scheduler_finished = prometheus_client .Gauge (name = 'lmdeploy:num_requests_finished' ,
158- documentation = 'Number of current finished requests.' ,
159- labelnames = labelnames ).labels (* labelvalues )
160+ self .gauge_scheduler_completed = prometheus_client .Gauge (name = 'lmdeploy:num_requests_completed' ,
161+ documentation = 'Number of current completed requests.' ,
162+ labelnames = labelnames ).labels (* labelvalues )
163+
164+ self .gauge_scheduler_api_routed = prometheus_client .Gauge (
165+ name = 'lmdeploy:num_api_requests_routed' ,
166+ documentation = 'Number of requests routed to request handles.' ,
167+ labelnames = labelnames ).labels (* labelvalues )
160168
161- self .gauge_scheduler_unfinished = prometheus_client .Gauge (
162- name = 'lmdeploy:num_requests_unfinished ' ,
163- documentation = 'Number of current unfinished requests .' ,
169+ self .gauge_scheduler_api_waiting = prometheus_client .Gauge (
170+ name = 'lmdeploy:num_api_requests_waiting ' ,
171+ documentation = 'Number of requests waiting for free request handles .' ,
164172 labelnames = labelnames ).labels (* labelvalues )
165173
166174 self .gauge_scheduler_running = prometheus_client .Gauge (
@@ -300,8 +308,10 @@ def __init__(self, model_name: str, max_model_len: int, dp_rank: int = 0):
300308
301309 def record_schedule (self , stats : SchedulerStats ) -> None :
302310 """Report schedule metrics to prometheus."""
303- self .gauge_scheduler_finished .set (stats .num_finished_reqs )
304- self .gauge_scheduler_unfinished .set (stats .num_total_reqs - stats .num_finished_reqs )
311+ self .gauge_scheduler_completed .set (stats .num_completed_reqs )
312+ self .gauge_scheduler_api_routed .set (stats .num_api_routed_reqs )
313+ self .gauge_scheduler_api_waiting .set (stats .num_total_reqs - stats .num_completed_reqs -
314+ stats .num_api_routed_reqs )
305315 self .gauge_scheduler_running .set (stats .num_running_reqs )
306316 self .gauge_scheduler_waiting .set (stats .num_waiting_reqs )
307317 self .gauge_gpu_cache_usage .set (stats .gpu_cache_usage )
0 commit comments