Disable prefix caching for some models

RunningLeon · RunningLeon · commit 83dd5562f94d · 2025-04-03T15:37:40.000+08:00
diff --git a/lmdeploy/api.py b/lmdeploy/api.py
@@ -68,7 +68,7 @@ def pipeline(model_path: str,
             if backend_config is not None else None
         model_path = get_model(model_path, download_dir, revision)
 
-    task, pipeline_class = get_task(model_path)
+    _, pipeline_class = get_task(model_path)
 
     if type(backend_config) is not PytorchEngineConfig:
         # set auto backend mode
diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py
@@ -243,16 +243,17 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None:
             src_to_dst (Dict[int, int]): Map between src and dst.
         """
         self._swap(self.full_gpu_cache, self.full_cpu_cache, src_to_dst)
-        
+
     def copy_to(self, src_to_dst: Dict[int, int], cache_type: str = 'gpu') -> None:
         """Copy cache.
+
         Args:
             src_to_dst (Dict[int, int]): Map between src and dst.
             cache_type (str): cache type 'cpu', 'gpu'
         """
         target_cache = self.full_gpu_cache if cache_type == 'gpu' else self.full_cpu_cache
         self._swap(target_cache, target_cache, src_to_dst)
-        
+
     @classmethod
     def get_cache_block_size(cls,
                              block_size: int,
diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py
@@ -249,7 +249,10 @@ async def __forward(inputs):
                 return await self.async_forward(inputs, swap_in_map=dict(), swap_out_map=dict(), copy_map=dict())
             else:
                 swap_done = True
-                return await self.async_forward(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map, copy_map=copy_map)
+                return await self.async_forward(inputs,
+                                                swap_in_map=swap_in_map,
+                                                swap_out_map=swap_out_map,
+                                                copy_map=copy_map)
 
         async def __long_context_single_forward(new_inputs, max_seqlen: int):
             """one large sequence."""
diff --git a/lmdeploy/serve/vl_async_engine.py b/lmdeploy/serve/vl_async_engine.py
@@ -28,7 +28,13 @@ def __init__(self,
                  **kwargs) -> None:
         if backend == 'pytorch':
             try_import_deeplink(backend_config.device_type)
+        if backend_config.enable_prefix_caching and backend == 'turbomind':
+            backend_config.enable_prefix_caching = False
+            logger.warning('VLM does not support prefix caching for turbomind engine.')
         self.vl_encoder = ImageEncoder(model_path, backend, vision_config, backend_config=backend_config)
+        if backend_config.enable_prefix_caching and not self.vl_encoder.model.support_prefix_caching:
+            logger.warning(f'Prefix caching is not supported for {model_path}')
+
         super().__init__(model_path, backend=backend, backend_config=backend_config, **kwargs)
         if self.model_name == 'base':
             raise RuntimeError(
diff --git a/lmdeploy/vl/model/base.py b/lmdeploy/vl/model/base.py
@@ -14,6 +14,7 @@
 class VisonModel(ABC):
     """Visual model which extract image feature."""
     _arch: Union[str, List[str]] = None
+    support_prefix_caching: bool = True
 
     def __init__(self,
                  model_path: str,
diff --git a/lmdeploy/vl/model/cogvlm.py b/lmdeploy/vl/model/cogvlm.py
@@ -3,7 +3,6 @@
 
 from lmdeploy.utils import get_logger
 from lmdeploy.vl.model.base import VISION_MODELS, VisonModel
-from lmdeploy.vl.utils import hash_multimodal_data
 
 logger = get_logger('lmdeploy')
 
@@ -13,6 +12,7 @@ class CogVLMVisionModel(VisonModel):
     """CogVLM vision model."""
 
     _arch = 'CogVLMForCausalLM'
+    support_prefix_caching: bool = False
 
     def build_preprocessor(self):
         from torchvision import transforms
@@ -47,14 +47,10 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         for image, params in images:
             image = image.convert('RGB')
             pixel_values = self.image_transform(image)
-            hash_value = None
-            if self.enable_prefix_caching:
-                hash_value = hash_multimodal_data(model_id=self.model_path, image=image, params=params)
             outputs.append(
                 dict(pixel_values=pixel_values,
                      image_size=image.size,
                      image_tokens=self.n_token_per_image,
-                     hash_value=hash_value,
                      image_token_id=self.image_token_id))
         messages.append(dict(role='preprocess', content=outputs))
         return messages
diff --git a/lmdeploy/vl/model/gemma3_vl.py b/lmdeploy/vl/model/gemma3_vl.py
@@ -39,6 +39,7 @@ class Gemma3VisionModel(VisonModel):
     """Gemma3 vision model."""
 
     _arch = 'Gemma3ForConditionalGeneration'
+    support_prefix_caching: bool = False
 
     def __init__(self,
                  model_path: str,
diff --git a/lmdeploy/vl/model/glm_4v.py b/lmdeploy/vl/model/glm_4v.py
@@ -5,7 +5,6 @@
 
 from lmdeploy.utils import get_logger
 from lmdeploy.vl.model.base import VISION_MODELS, VisonModel
-from lmdeploy.vl.utils import hash_multimodal_data
 
 logger = get_logger('lmdeploy')
 
@@ -15,6 +14,7 @@ class GLM4VisionModel(VisonModel):
     """glm-4v-9b vision model."""
 
     _arch = ['ChatGLMModel', 'ChatGLMForConditionalGeneration']
+    support_prefix_caching: bool = False
 
     @classmethod
     def match(cls, config: AutoConfig):
@@ -60,13 +60,9 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
             images = [x.convert('RGB') for x in images]
             pixel_values = [self.image_transform(x) for x in images]
             for image, pixel_value in zip(images, pixel_values):
-                hash_value = None
-                if self.enable_prefix_caching:
-                    hash_value = hash_multimodal_data(model_id=self.model_path, image=image)
                 data = dict(pixel_values=pixel_value,
                             image_size=image.size,
                             image_tokens=self.n_token_per_image,
-                            hash_value=hash_value,
                             image_token_id=self.image_token_id)
                 outputs.append(data)
         messages.append(dict(role='preprocess', content=outputs))
diff --git a/lmdeploy/vl/model/mllama.py b/lmdeploy/vl/model/mllama.py
@@ -18,6 +18,7 @@ class MllamaVLModel(VisonModel):
     """llama3.2 model."""
 
     _arch = 'MllamaForConditionalGeneration'
+    support_prefix_caching: bool = False
 
     def build_preprocessor(self):
         from transformers import AutoProcessor
diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py
@@ -4,7 +4,6 @@
 import torch
 
 from lmdeploy.vl.model.base import VISION_MODELS, VisonModel
-from lmdeploy.vl.utils import hash_multimodal_data
 
 
 def check_qwen_vl_deps_install():
@@ -26,6 +25,7 @@ class Qwen2VLModel(VisonModel):
     """Qwen2VL model."""
 
     _arch = ['Qwen2VLForConditionalGeneration', 'Qwen2_5_VLForConditionalGeneration']
+    support_prefix_caching: bool = False
 
     def build_preprocessor(self):
         check_qwen_vl_deps_install()
@@ -41,20 +41,13 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
         outputs = []
         for image, params in images:
             image = image.convert('RGB')
-            hash_value = None
-            if self.enable_prefix_caching:
-                hash_value = hash_multimodal_data(model_id=self.model_path, image=image, params=params)
             item = dict(type='image', image=image)
             item.update({key: params[key] for key in params.keys() if key in optional_keys})
             image_inputs, _ = process_vision_info([dict(content=[item])])
             result = self.processor.image_processor(images=image_inputs, videos=None, return_tensors='pt')
             merge_length = self.processor.image_processor.merge_size**2
             image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
-            result.update(
-                dict(image_size=image.size,
-                     image_tokens=image_tokens,
-                     image_token_id=self.image_token_id,
-                     hash_value=hash_value))
+            result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
             outputs.append(result)
         messages.append(dict(role='preprocess', content=outputs))
         return messages