From 8ac7890f966bcbcc91f25a83a4e613a7fc6522ab Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 22 Feb 2026 14:34:11 +0000 Subject: [PATCH 1/3] Fix prompt cache length default to auto-compute from prompt_cache_max_pct When --prompt-cache-max-pct is set but --prompt-cache-max-len is not explicitly provided, auto-compute common_tokens as int(prompt_tokens * prompt_cache_max_pct / 100). This makes '--prompt-cache-max-pct 100 --prompt-tokens 8192' do the intuitive thing: build an 8192-token shared prefix so the server can actually cache it. Previously, --prompt-cache-max-len defaulted to 0, which meant TranslationDataset always built an empty shared prefix regardless of --prompt-cache-max-pct, defeating prompt caching. Changed --prompt-cache-max-len default from 0 to None so we can distinguish 'not provided' from 'explicitly set to 0'. Co-authored-by: Aidan Do --- llm_bench/load_test.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index 174a903..fa23a74 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -162,13 +162,20 @@ def _create_dataset(cls, options: argparse.Namespace): prompt = options.prompt dataset_file = "code.txt" + if options.prompt_cache_max_len is not None: + common_tokens = options.prompt_cache_max_len + elif options.prompt_cache_max_pct is not None: + common_tokens = int(options.prompt_tokens * options.prompt_cache_max_pct / 100) + else: + common_tokens = 0 + return TranslationDataset( path=os.path.join(os.path.dirname(os.path.abspath(__file__)), dataset_file), prompt="\n\n" + prompt, tokenizer_path=options.tokenizer, chat=options.chat, num_tokens=options.prompt_tokens, - common_tokens=options.prompt_cache_max_len, + common_tokens=common_tokens, ) else: raise ValueError(f"Unknown dataset: {options.dataset}") @@ -1451,8 +1458,11 @@ def init_parser(parser): "--prompt-cache-max-len", env_var="PROMPT_CACHE_MAX_LEN", type=int, - default=0, - 
help="Maximum length of the prompt cache to use. Defaults to 0 (no caching).", + default=None, + help="Maximum number of shared prefix tokens across requests. " + "If not specified but --prompt-cache-max-pct is set, auto-computed as " + "int(prompt_tokens * prompt_cache_max_pct / 100). Defaults to 0 (no shared prefix) " + "when neither this nor --prompt-cache-max-pct is provided.", ) parser.add_argument( "--prompt-cache-max-pct", From 27b5a10137a518690d76998d9ba6773d74904967 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 22 Feb 2026 14:36:57 +0000 Subject: [PATCH 2/3] Keep --prompt-cache-max-len default as 0 Per review feedback, keep the default at 0 instead of None. The auto-computation from --prompt-cache-max-pct now triggers when prompt_cache_max_len is 0 (the default) and prompt_cache_max_pct is set. An explicit non-zero --prompt-cache-max-len still takes precedence. Co-authored-by: Aidan Do --- llm_bench/load_test.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index fa23a74..82d118b 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -162,7 +162,7 @@ def _create_dataset(cls, options: argparse.Namespace): prompt = options.prompt dataset_file = "code.txt" - if options.prompt_cache_max_len is not None: + if options.prompt_cache_max_len > 0: common_tokens = options.prompt_cache_max_len elif options.prompt_cache_max_pct is not None: common_tokens = int(options.prompt_tokens * options.prompt_cache_max_pct / 100) @@ -1458,11 +1458,10 @@ def init_parser(parser): "--prompt-cache-max-len", env_var="PROMPT_CACHE_MAX_LEN", type=int, - default=None, + default=0, help="Maximum number of shared prefix tokens across requests. " - "If not specified but --prompt-cache-max-pct is set, auto-computed as " - "int(prompt_tokens * prompt_cache_max_pct / 100). 
Defaults to 0 (no shared prefix) " - "when neither this nor --prompt-cache-max-pct is provided.", + "When --prompt-cache-max-pct is set and this is 0, auto-computed as " + "int(prompt_tokens * prompt_cache_max_pct / 100). Defaults to 0.", ) parser.add_argument( "--prompt-cache-max-pct", From 9460832e4b273ec0a88a4718925f451258026380 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 22 Feb 2026 14:38:30 +0000 Subject: [PATCH 3/3] Revert to None default for --prompt-cache-max-len Cleaner idiom: use None to mean 'not provided' rather than overloading 0. Behaviour is unchanged for the common cases; the one difference is that an explicit '--prompt-cache-max-len 0' now takes precedence over --prompt-cache-max-pct (common_tokens stays 0 instead of being auto-computed from the percentage). Co-authored-by: Aidan Do --- llm_bench/load_test.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llm_bench/load_test.py b/llm_bench/load_test.py index 82d118b..fa23a74 100644 --- a/llm_bench/load_test.py +++ b/llm_bench/load_test.py @@ -162,7 +162,7 @@ def _create_dataset(cls, options: argparse.Namespace): prompt = options.prompt dataset_file = "code.txt" - if options.prompt_cache_max_len > 0: + if options.prompt_cache_max_len is not None: common_tokens = options.prompt_cache_max_len elif options.prompt_cache_max_pct is not None: common_tokens = int(options.prompt_tokens * options.prompt_cache_max_pct / 100) @@ -1458,10 +1458,11 @@ def init_parser(parser): "--prompt-cache-max-len", env_var="PROMPT_CACHE_MAX_LEN", type=int, - default=0, + default=None, help="Maximum number of shared prefix tokens across requests. " - "When --prompt-cache-max-pct is set and this is 0, auto-computed as " - "int(prompt_tokens * prompt_cache_max_pct / 100). Defaults to 0.", + "If not specified but --prompt-cache-max-pct is set, auto-computed as " + "int(prompt_tokens * prompt_cache_max_pct / 100). Defaults to 0 (no shared prefix) " + "when neither this nor --prompt-cache-max-pct is provided.", ) parser.add_argument( "--prompt-cache-max-pct",