Skip to content

Commit f4e630e

Browse files
committed
empty cache
1 parent 74f33ef commit f4e630e

File tree

2 files changed

+51
-28
lines changed

2 files changed

+51
-28
lines changed

fastdeploy/model_executor/layers/linear.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,13 @@ def __init__(
143143
self.with_bias = with_bias
144144
self.add_bias = add_bias
145145
self.prefix = prefix
146-
self.is_quantized = fd_config.model_config.is_quantized
146+
self.is_quantized = fd_config.model_config.is_quantized and not (
147+
fd_config.quant_config.name() == "mix_quant" and fd_config.quant_config.dense_quant_type is None
148+
)
147149
# key
148150
if weight_key:
149151
self.weight_key = f"{prefix}.{weight_key}"
150-
elif fd_config.model_config.is_quantized and not skip_quant:
152+
elif self.is_quantized and not skip_quant:
151153
self.weight_key = f"{prefix}.quant_weight"
152154
self.weight_scale_key = f"{prefix}.weight_scale"
153155
self.act_scale_key = f"{prefix}.activation_scale"
@@ -170,7 +172,7 @@ def __init__(
170172
self.output_size,
171173
]
172174

173-
if fd_config.quant_config and not skip_quant:
175+
if fd_config.quant_config and not skip_quant and fd_config.quant_config.get_quant_method(self):
174176
self.quant_method = fd_config.quant_config.get_quant_method(self)
175177
else:
176178
self.quant_method: Optional[QuantMethodBase] = UnquantizedLinearMethod()
@@ -232,7 +234,7 @@ def load_state_dict(self, state_dict: dict):
232234
# weight
233235
self.state_dict = state_dict
234236
assert self.weight_key is not None, "weight_key should not be None."
235-
if self.fd_config.model_config.is_quantized:
237+
if self.is_quantized:
236238
self.load_prequant_weight(state_dict)
237239
else:
238240
self.load_weight(state_dict)
@@ -358,10 +360,6 @@ def __init__(
358360
self.output_sizes = output_sizes
359361

360362
def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
361-
weight_need_transpose = getattr(param, "weight_need_transpose", False)
362-
if weight_need_transpose:
363-
loaded_weight = get_tensor(loaded_weight).transpose([1, 0])
364-
365363
assert loaded_shard_id in ["q_a", "kv_a"]
366364
if not param._is_initialized():
367365
param.initialize()
@@ -387,7 +385,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
387385
else:
388386
loaded_weight = loaded_weight.cast(param.dtype)
389387
# (bukejiyu) After this fix, the early H2D copy for non-GPU devices is no longer needed and can be safely removed.
390-
loaded_weight = get_tensor(loaded_weight)
391388
h2d_copy(param, loaded_weight)
392389

393390

@@ -784,7 +781,7 @@ def load_state_dict(self, state_dict: dict):
784781
assert self.weight_key is not None, "weight_key should not be None."
785782
# qkv fused in disk
786783

787-
if self.fd_config.model_config.is_quantized:
784+
if self.is_quantized:
788785
self.load_prequant_weight(state_dict)
789786
else:
790787
self.load_weight(state_dict)
@@ -959,16 +956,20 @@ def __init__(
959956
# Split num_attention_heads when using TP inference.
960957
self.num_heads_per_partition = divide(num_attention_heads, self.nranks)
961958
self.local_rank = fd_config.parallel_config.tensor_parallel_rank
962-
963-
self.kv_b_proj = kv_b_proj
959+
self.fd_config = fd_config
960+
if self.fd_config.load_config.load_choices == "default_v1":
961+
self.kv_b_proj = kv_b_proj
962+
else:
963+
self.kv_b_proj = None
964964

965965
self.weight_dtype = self._helper.get_default_dtype()
966966

967967
# Override weight keys to use the combined kv_b_proj
968968
self.weight_key = f"{prefix}.weight" # e.g., "kv_b_proj.weight"
969969

970970
def process_weights_after_loading(self):
971-
971+
if self.fd_config.load_config.dynamic_load_weight:
972+
return
972973
w = self.kv_b_proj.weight.reshape(
973974
[
974975
self.kv_lora_rank,

fastdeploy/model_executor/utils.py

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -131,18 +131,26 @@ def slice_fn(weight_or_paramter, output_dim, start, end, step=1):
131131
def process_weight_transpose(layer, weight_name):
132132
weight = getattr(layer, weight_name)
133133
if len(weight.shape) == 2:
134-
weight_transpose = weight.transpose([1, 0])
134+
weight_shape = weight.shape[::-1]
135135
elif len(weight.shape) == 3:
136-
weight_transpose = weight.transpose([0, 2, 1])
137-
136+
weight_shape = [weight.shape[0]] + list(weight.shape[1:][::-1])
138137
weight_tmp = layer.create_parameter(
139-
shape=weight_transpose.shape,
140-
dtype=weight_transpose.dtype,
138+
shape=weight_shape,
139+
dtype=weight.dtype,
141140
default_initializer=paddle.nn.initializer.Constant(0),
142141
is_bias=False,
143142
)
143+
if layer.fd_config.load_config.dynamic_load_weight or layer.fd_config.model_config.enable_cache:
144+
free_tensor(weight, clear_memory=False)
145+
setattr(layer, weight_name, weight_tmp)
146+
return
147+
148+
if len(weight.shape) == 2:
149+
weight_transpose = weight.transpose([1, 0])
150+
elif len(weight.shape) == 3:
151+
weight_transpose = weight.transpose([0, 2, 1])
144152
weight_tmp.copy_(weight_transpose, False)
145-
free_tensor(weight)
153+
free_tensor(weight, clear_memory=False)
146154
setattr(layer, weight_name, weight_tmp)
147155

148156

@@ -163,9 +171,16 @@ def fn(model_sublayer_name: str, param=None):
163171
model_sublayer = sublayers_dict[model_sublayer_name]
164172
if isinstance(model_sublayer, KVBatchLinear):
165173
model_sublayer.process_weights_after_loading()
174+
if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16:
175+
# skip for offline quantization
176+
return
166177
if hasattr(model_sublayer, "quant_method"):
167178
quant_method = getattr(model_sublayer, "quant_method", None)
168-
unquant_moe_cls = type(get_moe_method())
179+
unquant_moe_layer = get_moe_method()
180+
if unquant_moe_layer is None:
181+
unquant_moe_cls = object
182+
else:
183+
unquant_moe_cls = type(unquant_moe_layer)
169184
if type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls:
170185
# skip unquantized linear
171186
return
@@ -225,26 +240,33 @@ def process_final_after_loading(model, fd_config: FDConfig):
225240
from fastdeploy.model_executor.layers.moe.moe import get_moe_method
226241

227242
for name, sublayer in model.named_sublayers():
243+
if isinstance(sublayer, KVBatchLinear):
244+
continue
228245
quant_method = getattr(sublayer, "quant_method", None)
229246
if quant_method is not None:
230-
unquant_moe_cls = type(get_moe_method())
231-
if not (type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls):
247+
unquant_moe_layer = get_moe_method()
248+
if unquant_moe_layer is None:
249+
unquant_moe_cls = object
250+
else:
251+
unquant_moe_cls = type(unquant_moe_layer)
252+
is_unquant_cls = type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls
253+
is_offline_quantized_ckpt = not (fd_config.quant_config and fd_config.quant_config.is_checkpoint_bf16)
254+
if is_unquant_cls or is_offline_quantized_ckpt:
255+
if hasattr(quant_method, "process_weights_after_loading"):
256+
quant_method.process_weights_after_loading(sublayer)
232257
continue
233-
if hasattr(quant_method, "process_weights_after_loading"):
234-
quant_method.process_weights_after_loading(sublayer)
235-
if isinstance(sublayer, KVBatchLinear):
236-
continue
237258
if not hasattr(sublayer, "process_weights_after_loading"):
238259
continue
239-
# Only for specific layers, such as lmhead
240260
sublayer.process_weights_after_loading()
241261

242262

243-
def free_tensor(tensor):
263+
def free_tensor(tensor, clear_memory=True):
244264
if hasattr(tensor, "tensor_track"):
245265
tensor.tensor_track = None
246266
tensor.value().get_tensor()._clear()
247267
del tensor
268+
if clear_memory:
269+
paddle.device.cuda.empty_cache()
248270

249271

250272
def fd_cast(weight, param):

0 commit comments

Comments (0)