1 change: 1 addition & 0 deletions README.md
@@ -162,6 +162,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
</ul>
</td>
<td>
2 changes: 2 additions & 0 deletions README_ja.md
@@ -141,6 +141,8 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
</ul>
</td>
<td>
1 change: 1 addition & 0 deletions README_zh-CN.md
@@ -164,6 +164,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
<li>GLM-4.7-Flash (30B)</li>
</ul>
</td>
<td>
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -125,6 +125,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |
| GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No |

```{note}
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -120,6 +120,7 @@
| GLM-4.1V-Thinking | 9B | MLLM | Yes | Yes | Yes | - | - |
| GLM-4.5 | 355B | LLM | Yes | Yes | Yes | - | - |
| GLM-4.5-Air | 106B | LLM | Yes | Yes | Yes | - | - |
| GLM-4.7-Flash | 30B | LLM | Yes | No | No | No | No |
| CodeGeeX4 | 9B | LLM | Yes | Yes | Yes | - | - |
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
7 changes: 3 additions & 4 deletions lmdeploy/pytorch/backends/cuda/attention/__init__.py
@@ -32,7 +32,7 @@ def use_fa3_warning():


@functools.lru_cache
def _enable_fa3(alibi: bool, learnable_sink: bool, block_sparse_size: int):
def _enable_fa3(alibi: bool, learnable_sink: bool, block_sparse_size: int, head_size: int) -> bool:
"""Check if FA3 should be enabled.

FA3 is enabled when:
@@ -44,7 +44,7 @@ def _enable_fa3(alibi: bool, learnable_sink: bool, block_sparse_size: int):
Returns:
True if FA3 should be enabled, False otherwise.
"""
enable = not alibi and not learnable_sink and block_sparse_size == 1
enable = not alibi and not learnable_sink and block_sparse_size == 1 and head_size <= 256
if enable and not use_fa3_warning():
enable = False
return enable
@@ -127,8 +127,7 @@ def build(
causal=causal,
**kwargs,
)

enable_fa3 = _enable_fa3(alibi, learnable_sink, block_sparse_size)
enable_fa3 = _enable_fa3(alibi, learnable_sink, block_sparse_size, head_size)

if use_flash_mla is True:
logger.debug('Build FlashMLAImpl Attention')
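A minimal sketch of the gate these hunks thread `head_size` into, restating only the boolean predicate visible in the diff; the real code additionally checks FA3 availability via `use_fa3_warning()` before enabling the backend.

```python
# Hypothetical, self-contained restatement of the FA3 gating shown above
# (illustration only, not the actual dispatch code in build()).
def should_enable_fa3(alibi: bool, learnable_sink: bool, block_sparse_size: int, head_size: int) -> bool:
    # FA3 is ruled out for ALiBi, learnable attention sinks, block-sparse
    # attention, and - new in this change - head sizes above 256.
    return (not alibi) and (not learnable_sink) and block_sparse_size == 1 and head_size <= 256


assert should_enable_fa3(False, False, 1, 128)      # typical dense head size
assert not should_enable_fa3(False, False, 1, 576)  # oversized heads fall back to another backend
```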
24 changes: 24 additions & 0 deletions lmdeploy/pytorch/configurations/glm4.py
@@ -0,0 +1,24 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .deepseek_v2 import DeepseekV2ModelConfigBuilder


class Glm4MoeLiteModelConfigBuilder(DeepseekV2ModelConfigBuilder):

@classmethod
def condition(cls, hf_config):
"""config."""
return hf_config.model_type in ['glm4_moe_lite']

@classmethod
def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False, spec_method: str = None, **kwargs):
"""build."""
# set default attrs
if not hasattr(hf_config, 'scoring_func'):
hf_config.scoring_func = 'sigmoid'
if not hasattr(hf_config, 'moe_layer_freq'):
hf_config.moe_layer_freq = 1
return super().build(hf_config,
model_path=model_path,
is_draft_model=is_draft_model,
spec_method=spec_method,
**kwargs)
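For context, a hedged sketch of how a builder like the one above would be exercised; the model path and the `AutoConfig` call are illustrative assumptions, not part of this PR.

```python
# Hypothetical usage sketch ('path/to/glm-4.7-flash' is a placeholder).
from transformers import AutoConfig

from lmdeploy.pytorch.configurations.glm4 import Glm4MoeLiteModelConfigBuilder

hf_config = AutoConfig.from_pretrained('path/to/glm-4.7-flash', trust_remote_code=True)

# condition() keys off model_type == 'glm4_moe_lite'; build() then backfills
# the DeepSeek-V2-style attrs (scoring_func, moe_layer_freq) before
# delegating to DeepseekV2ModelConfigBuilder.build().
if Glm4MoeLiteModelConfigBuilder.condition(hf_config):
    model_config = Glm4MoeLiteModelConfigBuilder.build(hf_config, model_path='path/to/glm-4.7-flash')
```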
2 changes: 1 addition & 1 deletion lmdeploy/pytorch/models/deepseek_mtp.py
@@ -134,7 +134,7 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
rope_scaling = get_rope_parameters(config)
if rope_scaling is not None:
mscale_all_dim = rope_scaling.get('mscale_all_dim', 0)
scaling_factor = rope_scaling['factor']
scaling_factor = rope_scaling.get('factor', 1.0)
if mscale_all_dim:
mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
self.softmax_scale = self.softmax_scale * mscale * mscale
2 changes: 1 addition & 1 deletion lmdeploy/pytorch/models/deepseek_v2.py
@@ -445,7 +445,7 @@ def __init__(self, config: Any, dtype: torch.dtype = None, device: torch.device
rope_scaling = get_rope_parameters(config)
if rope_scaling is not None:
mscale_all_dim = rope_scaling.get('mscale_all_dim', 0)
scaling_factor = rope_scaling['factor']
scaling_factor = rope_scaling.get('factor', 1.0)
if mscale_all_dim:
mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
self.softmax_scale = self.softmax_scale * mscale * mscale
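The deepseek_mtp.py and deepseek_v2.py hunks make the same defensive change; a tiny illustration of why, assuming a rope_scaling dict that omits 'factor':

```python
# Illustration only: a rope_scaling dict without 'factor' used to raise
# KeyError; .get('factor', 1.0) keeps the YaRN mscale math well-defined
# by treating a missing factor as "no scaling".
rope_scaling = {'mscale_all_dim': 1.0}            # 'factor' deliberately absent
scaling_factor = rope_scaling.get('factor', 1.0)  # -> 1.0 instead of KeyError
```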
4 changes: 4 additions & 0 deletions lmdeploy/pytorch/models/module_map.py
@@ -48,6 +48,10 @@
'Glm4MoeForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.glm4_moe.Glm4MoeForCausalLM',
})

# glm4.7

MODULE_MAP.update({'Glm4MoeLiteForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.deepseek_v2.DeepseekV2ForCausalLM'})

# internlm
MODULE_MAP.update({
'InternLMForCausalLM': f'{LMDEPLOY_PYTORCH_MODEL_PATH}.internlm.InternLMForCausalLM',
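The mapping above routes GLM-4.7-Flash through the existing DeepSeek-V2 implementation rather than a new module; a hedged sketch of the lookup, assuming `LMDEPLOY_PYTORCH_MODEL_PATH` resolves to 'lmdeploy.pytorch.models':

```python
# Hypothetical lookup: the architecture string from the checkpoint's
# config.json selects the PyTorch-engine model class to load.
from lmdeploy.pytorch.models.module_map import MODULE_MAP

target = MODULE_MAP['Glm4MoeLiteForCausalLM']
# expected: 'lmdeploy.pytorch.models.deepseek_v2.DeepseekV2ForCausalLM'
```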