@@ -275,6 +275,7 @@ def parse_args():
275275 ArgumentHelper .num_tokens_per_iter (tb_group )
276276 ArgumentHelper .max_prefill_iters (tb_group )
277277 ArgumentHelper .communicator (tb_group )
278+ ArgumentHelper .async_ (tb_group )
278279
279280 args = parser .parse_args ()
280281 return args
@@ -285,19 +286,19 @@ def main():
285286 random .seed (args .seed )
286287 os .environ ['TM_LOG_LEVEL' ] = args .log_level
287288 if args .backend == 'turbomind' :
288- engine_config = TurbomindEngineConfig (
289- max_batch_size = args .concurrency ,
290- tp = args .tp ,
291- cache_max_entry_count = args .cache_max_entry_count ,
292- session_len = args .session_len ,
293- cache_block_seq_len = args .cache_block_seq_len ,
294- model_format = args .model_format ,
295- quant_policy = args .quant_policy ,
296- num_tokens_per_iter = args .num_tokens_per_iter ,
297- max_prefill_iters = args .max_prefill_iters ,
298- enable_prefix_caching = args .enable_prefix_caching ,
299- communicator = args . communicator ,
300- )
289+ engine_config = TurbomindEngineConfig (max_batch_size = args . concurrency ,
290+ tp = args .tp ,
291+ cache_max_entry_count = args .cache_max_entry_count ,
292+ session_len = args .session_len ,
293+ cache_block_seq_len = args .cache_block_seq_len ,
294+ model_format = args .model_format ,
295+ quant_policy = args .quant_policy ,
296+ num_tokens_per_iter = args .num_tokens_per_iter ,
297+ max_prefill_iters = args .max_prefill_iters ,
298+ enable_prefix_caching = args .enable_prefix_caching ,
299+ communicator = args .communicator ,
300+ enable_metrics = False ,
301+ async_ = args . async_ )
301302 elif args .backend == 'pytorch' :
302303 engine_config = PytorchEngineConfig (
303304 cache_max_entry_count = args .cache_max_entry_count ,
0 commit comments