From 8ac7890f966bcbcc91f25a83a4e613a7fc6522ab Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 22 Feb 2026 14:34:11 +0000 Subject: [PATCH 1/3] Fix prompt cache length default to auto-compute from prompt_cache_max_pct When --prompt-cache-max-pct is set but --prompt-cache-max-len is not explicitly provided, auto-compute common_tokens as int(prompt_tokens * prompt_cache_max_pct / 100). This makes '--prompt-cache-max-pct 100 --prompt-tokens 8192' do the intuitive thing: build an 8192-token shared prefix so the server can actually cache it. Previously, --prompt-cache-max-len defaulted to 0, which meant TranslationDataset always built an empty shared prefix regardless of --prompt-cache-max-pct, defeating prompt caching. Changed --prompt-cache-max-len default from 0 to None so we can distinguish 'not provided' from 'explicitly set to 0'. Co-authored-by: Aidan Do --- llm_bench/load_test.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index 174a903..fa23a74 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -162,13 +162,20 @@ def _create_dataset(cls, options: argparse.Namespace): prompt = options.prompt dataset_file = "code.txt" + if options.prompt_cache_max_len is not None: + common_tokens = options.prompt_cache_max_len + elif options.prompt_cache_max_pct is not None: + common_tokens = int(options.prompt_tokens * options.prompt_cache_max_pct / 100) + else: + common_tokens = 0 + return TranslationDataset( path=os.path.join(os.path.dirname(os.path.abspath(__file__)), dataset_file), prompt="\n\n" + prompt, tokenizer_path=options.tokenizer, chat=options.chat, num_tokens=options.prompt_tokens, - common_tokens=options.prompt_cache_max_len, + common_tokens=common_tokens, ) else: raise ValueError(f"Unknown dataset: {options.dataset}") @@ -1451,8 +1458,11 @@ def init_parser(parser): "--prompt-cache-max-len", env_var="PROMPT_CACHE_MAX_LEN", type=int, - default=0, - 
help="Maximum length of the prompt cache to use. Defaults to 0 (no caching).", + default=None, + help="Maximum number of shared prefix tokens across requests. " + "If not specified but --prompt-cache-max-pct is set, auto-computed as " + "int(prompt_tokens * prompt_cache_max_pct / 100). Defaults to 0 (no shared prefix) " + "when neither this nor --prompt-cache-max-pct is provided.", ) parser.add_argument( "--prompt-cache-max-pct", From 27b5a10137a518690d76998d9ba6773d74904967 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 22 Feb 2026 14:36:57 +0000 Subject: [PATCH 2/3] Keep --prompt-cache-max-len default as 0 Per review feedback, keep the default at 0 instead of None. The auto-computation from --prompt-cache-max-pct now triggers when prompt_cache_max_len is 0 (the default) and prompt_cache_max_pct is set. An explicit non-zero --prompt-cache-max-len still takes precedence. Co-authored-by: Aidan Do --- llm_bench/load_test.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index fa23a74..82d118b 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -162,7 +162,7 @@ def _create_dataset(cls, options: argparse.Namespace): prompt = options.prompt dataset_file = "code.txt" - if options.prompt_cache_max_len is not None: + if options.prompt_cache_max_len > 0: common_tokens = options.prompt_cache_max_len elif options.prompt_cache_max_pct is not None: common_tokens = int(options.prompt_tokens * options.prompt_cache_max_pct / 100) @@ -1458,11 +1458,10 @@ def init_parser(parser): "--prompt-cache-max-len", env_var="PROMPT_CACHE_MAX_LEN", type=int, - default=None, + default=0, help="Maximum number of shared prefix tokens across requests. " - "If not specified but --prompt-cache-max-pct is set, auto-computed as " - "int(prompt_tokens * prompt_cache_max_pct / 100). 
Defaults to 0 (no shared prefix) " - "when neither this nor --prompt-cache-max-pct is provided.", + "When --prompt-cache-max-pct is set and this is 0, auto-computed as " + "int(prompt_tokens * prompt_cache_max_pct / 100). Defaults to 0.", ) parser.add_argument( "--prompt-cache-max-pct", From 9460832e4b273ec0a88a4718925f451258026380 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 22 Feb 2026 14:38:30 +0000 Subject: [PATCH 3/3] Revert to None default for --prompt-cache-max-len Cleaner idiom: use None to mean 'not provided' rather than overloading 0. Behaviour is unchanged for the common cases; the one difference is that an explicit '--prompt-cache-max-len 0' now takes precedence over --prompt-cache-max-pct (common_tokens stays 0 instead of being auto-computed from the percentage). Co-authored-by: Aidan Do --- llm_bench/load_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index 82d118b..fa23a74 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -162,7 +162,7 @@ def _create_dataset(cls, options: argparse.Namespace): prompt = options.prompt dataset_file = "code.txt" - if options.prompt_cache_max_len > 0: + if options.prompt_cache_max_len is not None: common_tokens = options.prompt_cache_max_len elif options.prompt_cache_max_pct is not None: common_tokens = int(options.prompt_tokens * options.prompt_cache_max_pct / 100) @@ -1458,10 +1458,11 @@ def init_parser(parser): "--prompt-cache-max-len", env_var="PROMPT_CACHE_MAX_LEN", type=int, - default=0, + default=None, help="Maximum number of shared prefix tokens across requests. " - "When --prompt-cache-max-pct is set and this is 0, auto-computed as " - "int(prompt_tokens * prompt_cache_max_pct / 100). Defaults to 0.", + "If not specified but --prompt-cache-max-pct is set, auto-computed as " + "int(prompt_tokens * prompt_cache_max_pct / 100). Defaults to 0 (no shared prefix) " + "when neither this nor --prompt-cache-max-pct is provided.", ) parser.add_argument( "--prompt-cache-max-pct",