datawhalechina · zzhRooT1998 · Feb 1, 2026
diff --git a/docs/chapter8/第八章记忆与检索.md b/docs/chapter8/第八章记忆与检索.md
@@ -1414,11 +1414,16 @@ def _chunk_paragraphs(paragraphs: List[Dict], chunk_tokens: int, overlap_tokens:
 
 ```python
 def _approx_token_len(text: str) -> int:
-    """近似估计Token长度，支持中英文混合"""
-    # CJK字符按1 token计算
+    # 统计 CJK 字符，每个算 1 token
     cjk = sum(1 for ch in text if _is_cjk(ch))
-    # 其他字符按空白分词计算
-    non_cjk_tokens = len([t for t in text.split() if t])
+
+    # 非 CJK 部分按空白字符分词，每个词算 1 token
+    # 先把所有 CJK 字符替换为空格（而不是直接删除），避免被 CJK 隔开的英文单词粘连成一个 token
+    non_cjk_text = ''.join(ch if not _is_cjk(ch) else ' ' for ch in text)
+
+    # 按空格分词，过滤空串
+    non_cjk_tokens = len([t for t in non_cjk_text.split() if t])
+
     return cjk + non_cjk_tokens
 
 def _is_cjk(ch: str) -> bool: