Skip to content

Commit f4e630e

Browse files
committed
empty cache
1 parent 74f33ef commit f4e630e

File tree

2 files changed

+51
-28
lines changed

2 files changed

+51
-28
lines changed

fastdeploy/model_executor/layers/linear.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -143,11 +143,13 @@ def __init__(
143143
self.with_bias = with_bias
144144
self.add_bias = add_bias
145145
self.prefix = prefix
146-
self.is_quantized = fd_config.model_config.is_quantized
146+
self.is_quantized = fd_config.model_config.is_quantized and not (
147+
fd_config.quant_config.name() == "mix_quant" and fd_config.quant_config.dense_quant_type is None
148+
)
147149
# key
148150
if weight_key:
149151
self.weight_key = f"{prefix}.{weight_key}"
150-
elif fd_config.model_config.is_quantized and not skip_quant:
152+
elif self.is_quantized and not skip_quant:
151153
self.weight_key = f"{prefix}.quant_weight"
152154
self.weight_scale_key = f"{prefix}.weight_scale"
153155
self.act_scale_key = f"{prefix}.activation_scale"
@@ -170,7 +172,7 @@ def __init__(
170172
self.output_size,
171173
]
172174

173-
if fd_config.quant_config and not skip_quant:
175+
if fd_config.quant_config and not skip_quant and fd_config.quant_config.get_quant_method(self):
174176
self.quant_method = fd_config.quant_config.get_quant_method(self)
175177
else:
176178
self.quant_method: Optional[QuantMethodBase] = UnquantizedLinearMethod()
@@ -232,7 +234,7 @@ def load_state_dict(self, state_dict: dict):
232234
# weight
233235
self.state_dict = state_dict
234236
assert self.weight_key is not None, "weight_key should not be None."
235-
if self.fd_config.model_config.is_quantized:
237+
if self.is_quantized:
236238
self.load_prequant_weight(state_dict)
237239
else:
238240
self.load_weight(state_dict)
@@ -358,10 +360,6 @@ def __init__(
358360
self.output_sizes = output_sizes
359361

360362
def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
361-
weight_need_transpose = getattr(param, "weight_need_transpose", False)
362-
if weight_need_transpose:
363-
loaded_weight = get_tensor(loaded_weight).transpose([1, 0])
364-
365363
assert loaded_shard_id in ["q_a", "kv_a"]
366364
if not param._is_initialized():
367365
param.initialize()
@@ -387,7 +385,6 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N
387385
else:
388386
loaded_weight = loaded_weight.cast(param.dtype)
389387
# (bukejiyu) After this fix, the early H2D copy for non-GPU devices is no longer needed and can be safely removed.
390-
loaded_weight = get_tensor(loaded_weight)
391388
h2d_copy(param, loaded_weight)
392389

393390

@@ -784,7 +781,7 @@ def load_state_dict(self, state_dict: dict):
784781
assert self.weight_key is not None, "weight_key should not be None."
785782
# qkv fused in disk
786783

787-
if self.fd_config.model_config.is_quantized:
784+
if self.is_quantized:
788785
self.load_prequant_weight(state_dict)
789786
else:
790787
self.load_weight(state_dict)
@@ -959,16 +956,20 @@ def __init__(
959956
# Split num_attention_heads when using TP inference.
960957
self.num_heads_per_partition = divide(num_attention_heads, self.nranks)
961958
self.local_rank = fd_config.parallel_config.tensor_parallel_rank
962-
963-
self.kv_b_proj = kv_b_proj
959+
self.fd_config = fd_config
960+
if self.fd_config.load_config.load_choices == "default_v1":
961+
self.kv_b_proj = kv_b_proj
962+
else:
963+
self.kv_b_proj = None
964964

965965
self.weight_dtype = self._helper.get_default_dtype()
966966

967967
# Override weight keys to use the combined kv_b_proj
968968
self.weight_key = f"{prefix}.weight" # e.g., "kv_b_proj.weight"
969969

970970
def process_weights_after_loading(self):
971-
971+
if self.fd_config.load_config.dynamic_load_weight:
972+
return
972973
w = self.kv_b_proj.weight.reshape(
973974
[
974975
self.kv_lora_rank,

fastdeploy/model_executor/utils.py

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -131,18 +131,26 @@ def slice_fn(weight_or_paramter, output_dim, start, end, step=1):
131131
def process_weight_transpose(layer, weight_name):
132132
weight = getattr(layer, weight_name)
133133
if len(weight.shape) == 2:
134-
weight_transpose = weight.transpose([1, 0])
134+
weight_shape = weight.shape[::-1]
135135
elif len(weight.shape) == 3:
136-
weight_transpose = weight.transpose([0, 2, 1])
137-
136+
weight_shape = [weight.shape[0]] + list(weight.shape[1:][::-1])
138137
weight_tmp = layer.create_parameter(
139-
shape=weight_transpose.shape,
140-
dtype=weight_transpose.dtype,
138+
shape=weight_shape,
139+
dtype=weight.dtype,
141140
default_initializer=paddle.nn.initializer.Constant(0),
142141
is_bias=False,
143142
)
143+
if layer.fd_config.load_config.dynamic_load_weight or layer.fd_config.model_config.enable_cache:
144+
free_tensor(weight, clear_memory=False)
145+
setattr(layer, weight_name, weight_tmp)
146+
return
147+
148+
if len(weight.shape) == 2:
149+
weight_transpose = weight.transpose([1, 0])
150+
elif len(weight.shape) == 3:
151+
weight_transpose = weight.transpose([0, 2, 1])
144152
weight_tmp.copy_(weight_transpose, False)
145-
free_tensor(weight)
153+
free_tensor(weight, clear_memory=False)
146154
setattr(layer, weight_name, weight_tmp)
147155

148156

@@ -163,9 +171,16 @@ def fn(model_sublayer_name: str, param=None):
163171
model_sublayer = sublayers_dict[model_sublayer_name]
164172
if isinstance(model_sublayer, KVBatchLinear):
165173
model_sublayer.process_weights_after_loading()
174+
if fd_config.quant_config and not fd_config.quant_config.is_checkpoint_bf16:
175+
# skip for offline quantization
176+
return
166177
if hasattr(model_sublayer, "quant_method"):
167178
quant_method = getattr(model_sublayer, "quant_method", None)
168-
unquant_moe_cls = type(get_moe_method())
179+
unquant_moe_layer = get_moe_method()
180+
if unquant_moe_layer is None:
181+
unquant_moe_cls = object
182+
else:
183+
unquant_moe_cls = type(unquant_moe_layer)
169184
if type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls:
170185
# skip unquantized linear
171186
return
@@ -225,26 +240,33 @@ def process_final_after_loading(model, fd_config: FDConfig):
225240
from fastdeploy.model_executor.layers.moe.moe import get_moe_method
226241

227242
for name, sublayer in model.named_sublayers():
243+
if isinstance(sublayer, KVBatchLinear):
244+
continue
228245
quant_method = getattr(sublayer, "quant_method", None)
229246
if quant_method is not None:
230-
unquant_moe_cls = type(get_moe_method())
231-
if not (type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls):
247+
unquant_moe_layer = get_moe_method()
248+
if unquant_moe_layer is None:
249+
unquant_moe_cls = object
250+
else:
251+
unquant_moe_cls = type(unquant_moe_layer)
252+
is_unquant_cls = type(quant_method) is UnquantizedLinearMethod or type(quant_method) is unquant_moe_cls
253+
is_offline_quantized_ckpt = not (fd_config.quant_config and fd_config.quant_config.is_checkpoint_bf16)
254+
if is_unquant_cls or is_offline_quantized_ckpt:
255+
if hasattr(quant_method, "process_weights_after_loading"):
256+
quant_method.process_weights_after_loading(sublayer)
232257
continue
233-
if hasattr(quant_method, "process_weights_after_loading"):
234-
quant_method.process_weights_after_loading(sublayer)
235-
if isinstance(sublayer, KVBatchLinear):
236-
continue
237258
if not hasattr(sublayer, "process_weights_after_loading"):
238259
continue
239-
# Only for specific layers, such as lmhead
240260
sublayer.process_weights_after_loading()
241261

242262

243-
def free_tensor(tensor):
263+
def free_tensor(tensor, clear_memory=True):
244264
if hasattr(tensor, "tensor_track"):
245265
tensor.tensor_track = None
246266
tensor.value().get_tensor()._clear()
247267
del tensor
268+
if clear_memory:
269+
paddle.device.cuda.empty_cache()
248270

249271

250272
def fd_cast(weight, param):

0 commit comments

Comments (0)