support matching non-full blocks for multimodals

RunningLeon · RunningLeon · commit d7f40fcc82c5 · 2025-03-12T17:08:14.000+08:00
diff --git a/lmdeploy/pytorch/engine/cache_engine.py b/lmdeploy/pytorch/engine/cache_engine.py
@@ -246,6 +246,16 @@ def swap_out(self, src_to_dst: Dict[int, int]) -> None:
         """
         self._swap(self.full_gpu_cache, self.full_cpu_cache, src_to_dst)
 
+    def copy_to(self, src_to_dst: Dict[int, int], cache_type: str = 'gpu') -> None:
+        """Copy cache.
+
+        Args:
+            src_to_dst (Dict[int, int]): Map between src and dst.
+            cache_type (str): 'gpu' copies within the GPU cache; any other value uses the CPU cache.
+        """
+        target_cache = self.full_gpu_cache if cache_type == 'gpu' else self.full_cpu_cache
+        self._swap(target_cache, target_cache, src_to_dst)
+
     @classmethod
     def get_cache_block_size(cls,
                              block_size: int,
diff --git a/lmdeploy/pytorch/engine/engine.py b/lmdeploy/pytorch/engine/engine.py
@@ -628,6 +628,7 @@ def __need_logits(seqs: SeqList):
         running = scheduler_output.running
         swap_in_map = scheduler_output.swap_in_map
         swap_out_map = scheduler_output.swap_out_map
+        copy_map = scheduler_output.copy_map
         assert len(running) > 0
 
         # create inputs
@@ -645,6 +646,7 @@ def __need_logits(seqs: SeqList):
                     inputs=inputs,
                     swap_in_map=swap_in_map,
                     swap_out_map=swap_out_map,
+                    copy_map=copy_map,
                     all_ids=all_ids,
                     guided_input_ids=guided_input_ids,
                     sampling_inputs=sampling_inputs,
diff --git a/lmdeploy/pytorch/engine/model_agent.py b/lmdeploy/pytorch/engine/model_agent.py
@@ -28,7 +28,7 @@ def msg_with_rank(rank: int, msg: str):
     return f'rank[{rank}] - {msg}'
 
 
-def cache_swapping(cache_engine: CacheEngine, swap_in_map: dict, swap_out_map: dict):
+def cache_swapping(cache_engine: CacheEngine, swap_in_map: dict, swap_out_map: dict, copy_map: dict):
     """perform cache swapping."""
     issued_cache_op = False
     if len(swap_in_map) > 0:
@@ -37,7 +37,9 @@ def cache_swapping(cache_engine: CacheEngine, swap_in_map: dict, swap_out_map: d
     if len(swap_out_map) > 0:
         cache_engine.swap_out(swap_out_map)
         issued_cache_op = True
-
+    if len(copy_map) > 0:
+        cache_engine.copy_to(copy_map)
+        issued_cache_op = True
     if issued_cache_op:
         cache_engine.events.wait()
 
@@ -63,7 +65,6 @@ def model_forward(
             kv_quant_policy=cache_engine.cache_config.quant_policy,
         )
         with ctx_mgr.context(context):
-            model_metas = None
             model_metas = model.update_model_metas(
                 past_key_values=cache_engine.gpu_cache,
                 context=context,
@@ -123,7 +124,7 @@ def all_context(self):
         with device_mgr.context(self.device_ctx), dist_mgr.context(self.dist_ctx):
             yield
 
-    async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
+    async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap, copy_map: SwapMap):
         """model forward.
 
         Args:
@@ -172,7 +173,7 @@ def get_free_mem(self):
             gpu_mem_physical_free, _ = get_gpu_memory()
             return gpu_mem_physical_free
 
-    async def _async_model_forward(self, inputs: ModelInputs, swap_in_map: Dict, swap_out_map: Dict,
+    async def _async_model_forward(self, inputs: ModelInputs, swap_in_map: Dict, swap_out_map: Dict, copy_map: Dict,
                                    return_logits: bool):
         """model forward."""
         max_prefill_token_num = self.cache_config.max_prefill_token_num
@@ -212,12 +213,15 @@ def get_output(self):
 
         async def __forward(inputs):
             """forward."""
-            nonlocal swap_done, swap_in_map, swap_out_map
+            nonlocal swap_done, swap_in_map, swap_out_map, copy_map
             if swap_done:
-                return await self.async_forward(inputs, swap_in_map=dict(), swap_out_map=dict())
+                return await self.async_forward(inputs, swap_in_map=dict(), swap_out_map=dict(), copy_map=dict())
             else:
                 swap_done = True
-                return await self.async_forward(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map)
+                return await self.async_forward(inputs,
+                                                swap_in_map=swap_in_map,
+                                                swap_out_map=swap_out_map,
+                                                copy_map=copy_map)
 
         async def __long_context_single_forward(inputs):
             """one large sequence."""
@@ -278,7 +282,7 @@ def __get_last_logits():
 
         return next_token_ids
 
-    async def _async_step_background(self, inputs: ModelInputs, swap_in_map: Dict, swap_out_map: Dict,
+    async def _async_step_background(self, inputs: ModelInputs, swap_in_map: Dict, swap_out_map: Dict, copy_map: Dict,
                                      all_ids: torch.Tensor, guided_input_ids: torch.Tensor,
                                      sampling_inputs: SamplingInputs, num_appendable_ids: torch.LongTensor,
                                      num_ignore_eos: torch.LongTensor, loop_count: int, return_logits: bool,
@@ -322,6 +326,7 @@ def __update_inputs(next_token_ids):
             output = await self._async_model_forward(inputs,
                                                      swap_in_map=swap_in_map,
                                                      swap_out_map=swap_out_map,
+                                                     copy_map=copy_map,
                                                      return_logits=return_logits)
             logits = output['logits']
             logits = logits[0]  # [bs, seq, prob] -> [seq, prob]
@@ -359,6 +364,7 @@ def __update_inputs(next_token_ids):
             if is_decoding and idx < loop_count - 1:
                 swap_in_map = dict()
                 swap_out_map = dict()
+                copy_map = dict()
                 inputs.model_metas = model_metas
                 __update_inputs(next_token_ids)
 
@@ -516,8 +522,8 @@ def build_cache_engine(self):
         with self.all_context():
             self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.tp_rank, world_size=self.tp)
 
-    def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
-        cache_swapping(self.cache_engine, swap_in_map=swap_in_map, swap_out_map=swap_out_map)
+    def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap, copy_map: SwapMap):
+        cache_swapping(self.cache_engine, swap_in_map=swap_in_map, swap_out_map=swap_out_map, copy_map=copy_map)
         output = model_forward(
             self.patched_model,
             inputs,
@@ -527,15 +533,15 @@ def _forward_impl(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map:
         )
         return output
 
-    async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap):
+    async def async_forward(self, inputs: ModelInputs, swap_in_map: SwapMap, swap_out_map: SwapMap, copy_map: SwapMap):
         """model forward.
 
         Args:
             inputs (Dict): The input data comes from _make_inputs.
             swap_in_map (SwapMap): Cache maps to swap in.
             swap_out_map (SwapMap): Cache maps to swap out.
         """
-        output = self._forward_impl(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map)
+        output = self._forward_impl(inputs, swap_in_map=swap_in_map, swap_out_map=swap_out_map, copy_map=copy_map)
         await asyncio.sleep(0)
         return output
 
diff --git a/lmdeploy/pytorch/messages.py b/lmdeploy/pytorch/messages.py
@@ -371,6 +371,17 @@ def __init__(self, multimodals: MultiModalInputs):
         if multimodals is None:
             multimodals = dict()
         self.multimodals = multimodals
+        self._init_mm_ranges()
+
+    def _init_mm_ranges(self):
+        """Init mm ranges and sort them by end position."""
+        mm_ranges = []
+        for _, modal_datas in self.multimodals.items():
+            for modal_data in modal_datas:
+                data = (modal_data.start, modal_data.end, modal_data.meta.get('hash_value', None))
+                mm_ranges.append(data)
+        mm_ranges.sort(key=lambda x: x[1])
+        self._mm_ranges = mm_ranges
 
     def get_datas(self, start=0, end=-1):
         """get multimodals from prompts position [start, end)."""
@@ -389,29 +400,24 @@ def get_datas(self, start=0, end=-1):
     def get_step(self, step: int):
         """get step that before a whole image."""
         real_step = step
-        for modal_type, modal_datas in self.multimodals.items():
-            for modal_data in modal_datas:
-                if modal_data.start > real_step:
-                    continue
-                elif modal_data.end <= real_step:
-                    continue
-                else:
-                    real_step = modal_data.start
+        for start, end, _ in self._mm_ranges:
+            if start <= real_step < end:
+                real_step = start
         return real_step
 
     def get_hash_values(self, start: int, end: int):
         """get multimodals hash values that from [start, end)"""
-        hash_values = []
-        for modal_type, modal_datas in self.multimodals.items():
-            for modal_data in modal_datas:
-                if modal_data.start < end and modal_data.end > start:
-                    if modal_data.meta.get('hash_value', None):
-                        hash_values.append(modal_data.meta['hash_value'])
-        if hash_values:
-            hash_values = tuple(hash_values)
-        else:
-            hash_values = None
-        return hash_values
+        mm_hash_values = []
+        multimodal_ends = []
+        for mm_start, mm_end, hash_value in self._mm_ranges:
+            # the mm range intersects the target range [start, end)
+            if mm_start < end and mm_end > start:
+                mm_hash_values.append(hash_value)
+            # the mm range ends inside the target range (start < mm_end <= end)
+            if start < mm_end <= end:
+                cur_data = (tuple(mm_hash_values), mm_end)
+                multimodal_ends.append(cur_data)
+        return tuple(mm_hash_values), multimodal_ends
 
     def add_inputs(self, input_mms: MultiModalInputs):
         """add new inputs."""
@@ -421,6 +427,14 @@ def add_inputs(self, input_mms: MultiModalInputs):
             else:
                 self.multimodals[modal_type] = vals
 
+            # update mm_ranges
+            for modal_data in vals:
+                data = (modal_data.start, modal_data.end, modal_data.meta.get('hash_value', None))
+                self._mm_ranges.append(data)
+
+        # sort mm_ranges
+        self._mm_ranges.sort(key=lambda x: x[1])
+
     def empty(self):
         if len(self.multimodals) == 0:
             return 0
diff --git a/lmdeploy/pytorch/paging/block_trie.py b/lmdeploy/pytorch/paging/block_trie.py
diff --git a/lmdeploy/pytorch/paging/scheduler.py b/lmdeploy/pytorch/paging/scheduler.py