Skip to content

Commit d46103a

Browse files
authored
support internvl3.5 (InternLM#3886)
* Fix interns1 LLM mapping for turbomind engine (InternLM#3848) * support internvl3.5 * update docs
1 parent d5b2716 commit d46103a

File tree

7 files changed

+77
-11
lines changed

7 files changed

+77
-11
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
159159
<li>InternVL2 (1B-76B)</li>
160160
<li>InternVL2.5(MPO) (1B-78B)</li>
161161
<li>InternVL3 (1B-78B)</li>
162+
<li>InternVL3.5 (1B-241BA28B)</li>
162163
<li>Intern-S1 (241B)</li>
163164
<li>Intern-S1-mini (8.3B)</li>
164165
<li>Mono-InternVL (2B)</li>

README_ja.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
157157
<li>InternVL2 (1B-76B)</li>
158158
<li>InternVL2.5(MPO) (1B-78B)</li>
159159
<li>InternVL3 (1B-78B)</li>
160+
<li>InternVL3.5 (1B-241BA28B)</li>
160161
<li>Intern-S1 (241B)</li>
161162
<li>Intern-S1-mini (8.3B)</li>
162163
<li>Mono-InternVL (2B)</li>

README_zh-CN.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
161161
<li>InternVL2 (1B-76B)</li>
162162
<li>InternVL2.5(MPO) (1B-78B)</li>
163163
<li>InternVL3 (1B-78B)</li>
164+
<li>InternVL3.5 (1B-241BA28B)</li>
164165
<li>Intern-S1 (241B)</li>
165166
<li>Intern-S1-mini (8.3B)</li>
166167
<li>Mono-InternVL (2B)</li>

docs/en/supported_models/supported_models.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
4040
| InternVL2<sup>\[2\]</sup> | 1 - 2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes |
4141
| InternVL2.5(MPO)<sup>\[2\]</sup> | 1 - 78B | MLLM | Yes | Yes\* | Yes\* | Yes |
4242
| InternVL3<sup>\[2\]</sup> | 1 - 78B | MLLM | Yes | Yes\* | Yes\* | Yes |
43+
| InternVL3.5<sup>\[3\]</sup> | 1 - 241BA28B | MLLM | Yes | Yes\* | Yes\* | No |
4344
| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes |
4445
| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
4546
| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
@@ -103,6 +104,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
103104
| InternVL2 | 1B-76B | MLLM | Yes | Yes | Yes | - | - |
104105
| InternVL2.5(MPO) | 1B-78B | MLLM | Yes | Yes | Yes | - | - |
105106
| InternVL3 | 1B-78B | MLLM | Yes | Yes | Yes | - | - |
107+
| InternVL3.5 | 1B-241BA28B | MLLM | Yes | Yes | Yes | No | No |
106108
| Mono-InternVL<sup>\[1\]</sup> | 2B | MLLM | Yes | Yes | Yes | - | - |
107109
| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - |
108110
| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - |

docs/zh_cn/supported_models/supported_models.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
| InternVL2 | 1-2B, 8B - 76B | MLLM | Yes | Yes\* | Yes\* | Yes |
4141
| InternVL2.5(MPO)<sup>\[2\]</sup> | 1 - 78B | MLLM | Yes | Yes\* | Yes\* | Yes |
4242
| InternVL3<sup>\[2\]</sup> | 1 - 78B | MLLM | Yes | Yes\* | Yes\* | Yes |
43+
| InternVL3.5<sup>\[3\]</sup> | 1 - 241BA28B | MLLM | Yes | Yes\* | Yes\* | No |
4344
| ChemVLM | 8B - 26B | MLLM | Yes | Yes | Yes | Yes |
4445
| MiniCPM-Llama3-V-2_5 | - | MLLM | Yes | Yes | Yes | Yes |
4546
| MiniCPM-V-2_6 | - | MLLM | Yes | Yes | Yes | Yes |
@@ -103,6 +104,7 @@
103104
| InternVL2 | 1B-76B | MLLM | Yes | Yes | Yes | - | - |
104105
| InternVL2.5(MPO) | 1B-78B | MLLM | Yes | Yes | Yes | - | - |
105106
| InternVL3 | 1B-78B | MLLM | Yes | Yes | Yes | - | - |
107+
| InternVL3.5 | 1B-241BA28B | MLLM | Yes | Yes | Yes | No | No |
106108
| Mono-InternVL<sup>\[1\]</sup> | 2B | MLLM | Yes\* | Yes | Yes | - | - |
107109
| ChemVLM | 8B-26B | MLLM | Yes | Yes | No | - | - |
108110
| Gemma2 | 9B-27B | LLM | Yes | Yes | Yes | - | - |

lmdeploy/turbomind/deploy/source_model/internvl.py

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
# Copyright (c) OpenMMLab. All rights reserved.
22
from .base import INPUT_MODELS
3+
from .gpt_oss import GptOssReader
34
from .internlm2 import InternLM2Reader
45
from .llama import LlamaModel, LlamaReader
5-
from .qwen import Qwen3MoeReader
6+
from .qwen import Qwen3MoeReader, Qwen3Reader
67

78

89
class InternVLReader(LlamaReader):
@@ -34,8 +35,59 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_
3435
super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
3536

3637

38+
class InternVL3d5Reader(Qwen3Reader):
39+
attn_layer_prefix = 'language_model.model.layers'
40+
attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
41+
tok_embeddings_key = 'language_model.model.embed_tokens.weight'
42+
norm_weight_key = 'language_model.model.norm.weight'
43+
output_weight_key = 'language_model.lm_head.weight'
44+
45+
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
46+
model_cfg = model_cfg.get('llm_config') or model_cfg.get('text_config')
47+
super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
48+
49+
50+
class InternVL3d5Qwen3MoEReader(Qwen3MoeReader):
51+
attn_layer_prefix = 'language_model.model.layers'
52+
attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
53+
tok_embeddings_key = 'language_model.model.embed_tokens.weight'
54+
norm_weight_key = 'language_model.model.norm.weight'
55+
output_weight_key = 'language_model.lm_head.weight'
56+
57+
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
58+
model_cfg = model_cfg.get('llm_config') or model_cfg.get('text_config')
59+
super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
60+
61+
62+
class InternVL3d5GptOSSReader(GptOssReader):
63+
attn_layer_prefix = 'language_model.model.layers'
64+
attn_layer_patten = r'language_model\.model\.layers\.([0-9]+).'
65+
tok_embeddings_key = 'language_model.model.embed_tokens.weight'
66+
norm_weight_key = 'language_model.model.norm.weight'
67+
output_weight_key = 'language_model.lm_head.weight'
68+
69+
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
70+
model_cfg = model_cfg.get('llm_config') or model_cfg.get('text_config')
71+
super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
72+
73+
3774
class InternS1Reader(Qwen3MoeReader):
38-
"""InternVL3Reader for InternVL+Qwen3MoE model."""
75+
"""InternS1Reader for internlm/InternS1 model."""
76+
77+
attn_layer_prefix = 'model.language_model.layers'
78+
attn_layer_patten = r'model\.language_model\.layers\.([0-9]+).'
79+
tok_embeddings_key = 'model.language_model.embed_tokens.weight'
80+
norm_weight_key = 'model.language_model.norm.weight'
81+
output_weight_key = 'lm_head.weight'
82+
83+
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
84+
model_cfg = model_cfg.get('text_config')
85+
if model_cfg is None:
86+
raise ValueError(f'Missing "text_config" in model config: {model_cfg}')
87+
super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
88+
89+
90+
class InternS1MiniReader(Qwen3Reader):
3991

4092
attn_layer_prefix = 'model.language_model.layers'
4193
attn_layer_patten = r'model\.language_model\.layers\.([0-9]+).'
@@ -58,14 +110,22 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs):
58110
super().__init__(model_path, tokenizer_path, **kwargs)
59111
from transformers import AutoConfig
60112
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
113+
114+
arch = config.architectures[0]
115+
if arch == 'InternVLChatModel':
116+
relations = dict(InternLM2ForCausalLM=('internlm2', InternVL2Reader),
117+
LlamaForCausalLM=('llama', InternVLReader),
118+
Qwen2ForCausalLM=('qwen2', InternVLReader),
119+
Qwen3MoeForCausalLM=('qwen3-moe', InternVL3d5Qwen3MoEReader),
120+
Qwen3ForCausalLM=('qwen3', InternVL3d5Reader),
121+
GptOssForCausalLM=('gpt-oss', InternVL3d5GptOSSReader))
122+
elif arch == 'InternS1ForConditionalGeneration':
123+
relations = dict(Qwen3MoeForCausalLM=('qwen3-moe', InternS1Reader),
124+
Qwen3ForCausalLM=('qwen3', InternS1MiniReader))
125+
else:
126+
raise ValueError(f'unsupported model arch {arch}')
61127
self.llm_config = getattr(config, 'llm_config', None) or getattr(config, 'text_config', None)
62128
arch = self.llm_config.architectures[0]
63-
relations = dict(
64-
InternLM2ForCausalLM=('internlm2', InternVL2Reader),
65-
LlamaForCausalLM=('llama', InternVLReader),
66-
Qwen2ForCausalLM=('qwen2', InternVLReader),
67-
Qwen3MoeForCausalLM=('qwen3-moe', InternS1Reader),
68-
)
69129
llm_model, self.Reader = relations[arch]
70130
self.llm_model = INPUT_MODELS.get(llm_model)(model_path=model_path, tokenizer_path=tokenizer_path, **kwargs)
71131

lmdeploy/turbomind/supported_models.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,8 @@ def is_supported(model_path: str):
8282
import os
8383

8484
def _is_head_dim_supported(cfg):
85-
num_attn_head = cfg.num_attention_heads
86-
hidden_size = cfg.hidden_size
87-
return (hidden_size // num_attn_head) in [128, 64]
85+
head_dim = cfg.head_dim if hasattr(cfg, 'head_dim') else cfg.hidden_size // cfg.num_attention_heads
86+
return head_dim in [128, 64]
8887

8988
support_by_turbomind = False
9089
triton_model_path = os.path.join(model_path, 'triton_models')

0 commit comments

Comments
 (0)