InternLM
diff --git a/.github/workflows/cuda12.8-whl-release.yml b/.github/workflows/cuda12.8-whl-release.yml
Lines changed: 2 additions & 2 deletions
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
Lines changed: 1 addition & 1 deletion
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 4 additions & 4 deletions
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/README_ja.md b/README_ja.md
Lines changed: 1 addition & 1 deletion
diff --git a/autotest/config.yaml b/autotest/config.yaml
Lines changed: 1 addition & 3 deletions
diff --git a/benchmark/profile_pipeline_api.py b/benchmark/profile_pipeline_api.py
Lines changed: 14 additions & 13 deletions
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
Lines changed: 2 additions & 0 deletions
diff --git a/builder/manywheel/Dockerfile_2014 b/builder/manywheel/Dockerfile_2014
Lines changed: 1 addition & 1 deletion
diff --git a/builder/manywheel/build_all_wheel.sh b/builder/manywheel/build_all_wheel.sh
Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ jobs:
   linux-build:
     strategy:
       matrix:
-        pyver: [py39, py310, py311, py312, py313]
+        pyver: [py310, py311, py312, py313]
     runs-on: ubuntu-latest
     env:
       PYTHON_VERSION: ${{ matrix.pyver }}
@@ -56,7 +56,7 @@ jobs:
   windows-build:
     strategy:
       matrix:
-        pyver: ['3.9', '3.10', '3.11', '3.12', '3.13']
+        pyver: ['3.10', '3.11', '3.12', '3.13']
     runs-on: windows-latest
     steps:
       - name: Set git for windows
 
@@ -36,7 +36,7 @@ jobs:
     timeout-minutes: 4320 # 72hours
     container:
       image: openmmlab/lmdeploy:dev-cu12.8
-      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e CUDA_VISIBLE_DEVICES=2,3 --pull never"
+      options: "--gpus=all --ipc=host --user root -e PIP_CACHE_DIR=/root/.cache/pip -e CUDA_VISIBLE_DEVICES=2,3 -e HF_HOME=/root/.cache/huggingface --pull never"
       volumes:
         - /nvme/share_data/github-actions/pip-cache:/root/.cache/pip
         - /nvme/share_data/github-actions/hf_home:/root/.cache/huggingface
 
@@ -71,11 +71,9 @@ FetchContent_MakeAvailable(repo-cutlass)
 FetchContent_Declare(
   yaml-cpp
   GIT_REPOSITORY https://github.com/jbeder/yaml-cpp.git
-  GIT_TAG                 0.8.0
+  GIT_TAG                 65c1c270dbe7eec37b2df2531d7497c4eea79aee
   GIT_PROGRESS            TRUE
   USES_TERMINAL_DOWNLOAD  TRUE
-  PATCH_COMMAND git apply ${CMAKE_CURRENT_SOURCE_DIR}/cmake/yaml-cpp_cmake_policy.patch
-  UPDATE_DISCONNECTED     1
 )
 set(YAML_BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library of yaml-cpp")
 FetchContent_MakeAvailable(yaml-cpp)
@@ -87,7 +85,6 @@ FetchContent_Declare(
   GIT_SUBMODULES          "3rdparty/dlpack"
   GIT_PROGRESS            TRUE
   USES_TERMINAL_DOWNLOAD  TRUE
-  UPDATE_DISCONNECTED     1
 )
 
 FetchContent_GetProperties(xgrammar)
@@ -110,6 +107,7 @@ endif()
 
 # the environment variables
 #   ASAN_OPTIONS=protect_shadow_gap=0,intercept_tls_get_addr=0
+#   LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.6:/usr/lib/x86_64-linux-gnu/libstdc++.so.6
 # must be set at runtime
 # https://github.com/google/sanitizers/issues/1322
 if (LMDEPLOY_ASAN_ENABLE)
@@ -333,6 +331,8 @@ if (MSVC)
     CMAKE_CUDA_FLAGS_RELWITHDEBINFO)
     string(REGEX REPLACE "-Wall" " /W0 " ${flag_var} "${${flag_var}}")
   endforeach()
+  # define NOMINMAX so that "windows.h" does not define min/max macros that conflict with std::min/std::max
+  add_definitions(-DNOMINMAX=1)
 endif()
 
 include_directories(
 
@@ -203,7 +203,7 @@ They differ in the types of supported models and the inference data type. Please
 
 ## Installation
 
-It is recommended installing lmdeploy using pip in a conda environment (python 3.9 - 3.13):
+It is recommended to install lmdeploy using pip in a conda environment (python 3.10 - 3.13):
 
 ```shell
 conda create -n lmdeploy python=3.10 -y
 
@@ -189,7 +189,7 @@ LMDeployは、[TurboMind](./docs/en/inference/turbomind.md)および[PyTorch](./
 
 ## インストール
 
-クリーンなconda環境（Python 3.9 - 3.12）でlmdeployをインストールすることをお勧めします。
+クリーンなconda環境（Python 3.10 - 3.13）でlmdeployをインストールすることをお勧めします。
 
 ```shell
 conda create -n lmdeploy python=3.10 -y
 
@@ -117,7 +117,6 @@ pytorch_chat_model:
         - meta-llama/Llama-4-Scout-17B-16E-Instruct
         - meta-llama/Llama-3.2-1B-Instruct
         - meta-llama/Llama-3.2-3B-Instruct
-        - meta-llama/Llama-3.2-11B-Vision-Instruct
         - meta-llama/Meta-Llama-3-1-8B-Instruct
         - meta-llama/Meta-Llama-3-1-70B-Instruct
         - meta-llama/Meta-Llama-3-8B-Instruct
@@ -219,7 +218,6 @@ turbomind_vl_model:
 
 pytorch_vl_model:
     tp:
-        - meta-llama/Llama-3.2-11B-Vision-Instruct
         - internlm/Intern-S1
         - internlm/Intern-S1-mini
         - OpenGVLab/InternVL2_5-26B-MPO
@@ -244,7 +242,7 @@ pytorch_vl_model:
         - Qwen/Qwen2.5-VL-7B-Instruct
         - Qwen/Qwen2.5-VL-32B-Instruct
         - THUDM/cogvlm-chat-hf
-        - THUDM/cogvlm2-llama3-chinese-chat-19B
+        # - THUDM/cogvlm2-llama3-chinese-chat-19B # 'HFChatTemplate' object has no attribute 'eoa'
         - THUDM/glm-4v-9b
         - microsoft/Phi-3-vision-128k-instruct
         - microsoft/Phi-3.5-vision-instruct
 
@@ -275,6 +275,7 @@ def parse_args():
     ArgumentHelper.num_tokens_per_iter(tb_group)
     ArgumentHelper.max_prefill_iters(tb_group)
     ArgumentHelper.communicator(tb_group)
+    ArgumentHelper.async_(tb_group)
 
     args = parser.parse_args()
     return args
@@ -285,19 +286,19 @@ def main():
     random.seed(args.seed)
     os.environ['TM_LOG_LEVEL'] = args.log_level
     if args.backend == 'turbomind':
-        engine_config = TurbomindEngineConfig(
-            max_batch_size=args.concurrency,
-            tp=args.tp,
-            cache_max_entry_count=args.cache_max_entry_count,
-            session_len=args.session_len,
-            cache_block_seq_len=args.cache_block_seq_len,
-            model_format=args.model_format,
-            quant_policy=args.quant_policy,
-            num_tokens_per_iter=args.num_tokens_per_iter,
-            max_prefill_iters=args.max_prefill_iters,
-            enable_prefix_caching=args.enable_prefix_caching,
-            communicator=args.communicator,
-        )
+        engine_config = TurbomindEngineConfig(max_batch_size=args.concurrency,
+                                              tp=args.tp,
+                                              cache_max_entry_count=args.cache_max_entry_count,
+                                              session_len=args.session_len,
+                                              cache_block_seq_len=args.cache_block_seq_len,
+                                              model_format=args.model_format,
+                                              quant_policy=args.quant_policy,
+                                              num_tokens_per_iter=args.num_tokens_per_iter,
+                                              max_prefill_iters=args.max_prefill_iters,
+                                              enable_prefix_caching=args.enable_prefix_caching,
+                                              communicator=args.communicator,
+                                              enable_metrics=False,
+                                              async_=args.async_)
     elif args.backend == 'pytorch':
         engine_config = PytorchEngineConfig(
             cache_max_entry_count=args.cache_max_entry_count,
 
@@ -327,6 +327,7 @@ def parse_args():
     ArgumentHelper.model_format(tb_group, default='hf')
     ArgumentHelper.num_tokens_per_iter(tb_group)
     ArgumentHelper.max_prefill_iters(tb_group)
+    ArgumentHelper.async_(tb_group)
     ArgumentHelper.communicator(tb_group)
 
     args = parser.parse_args()
@@ -348,6 +349,7 @@ def main():
             quant_policy=args.quant_policy,
             num_tokens_per_iter=args.num_tokens_per_iter,
             max_prefill_iters=args.max_prefill_iters,
+            async_=args.async_,
             enable_prefix_caching=args.enable_prefix_caching,
             dtype=args.dtype,
             communicator=args.communicator,
 
@@ -29,7 +29,7 @@ RUN bash /tmp/install_conda.sh && rm /tmp/install_conda.sh
 RUN /opt/conda/bin/conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
     /opt/conda/bin/conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
 
-RUN PY_VERSIONS=(3.9 3.10 3.11 3.12 3.13) && \
+RUN PY_VERSIONS=(3.10 3.11 3.12 3.13) && \
     for pyver in "${PY_VERSIONS[@]}"; do \
         /opt/conda/bin/conda create -n py${pyver//./} python=${pyver} -yq && \
         /opt/conda/envs/py${pyver//./}/bin/pip install -i 'https://mirrors.aliyun.com/pypi/simple/' --no-cache-dir pybind11; \
 
@@ -10,7 +10,7 @@ PLAT_NAME=manylinux2014_x86_64
 for cuver in ${CUDA_VER}; do
     DOCKER_TAG=cuda${cuver}
     OUTPUT_FOLDER=cuda${cuver}_dist
-    for pyver in py39 py310 py311 py312 py313; do
+    for pyver in py310 py311 py312 py313; do
         bash ${TOPDIR}/manywheel/build_wheel.sh ${pyver} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER} \
             |& tee ${PLAT_NAME}.${pyver}.cuda${cuver}.log.txt
     done