1 change: 1 addition & 0 deletions examples/robobrain/conf/compress.yaml
@@ -21,3 +21,4 @@ action: run
 hydra:
   run:
     dir: ${experiment.exp_dir}/hydra
+
20 changes: 20 additions & 0 deletions examples/robobrain/conf/compress/mix_precision.yaml
@@ -0,0 +1,20 @@
+defaults:
+  - model
+  - _self_
+
+system:
+  save_dir: Qwen3_30B_MixPrecision_Search
+
+compress_args:
+  scheme: "mix_precision_search"
+  targets: ["Linear"]
+
+data:
+  num_calibration_samples: 128
+  batch_size: 1
+
+tokenizer_args:
+  #tokenizer_path: ${model.model_path}
+  use_fast: true
+  trust_remote_code: true
+
23 changes: 23 additions & 0 deletions examples/robobrain/conf/compress_mix.yaml
@@ -0,0 +1,23 @@
+defaults:
+  - _self_
+  - compress: mix_precision
+
+experiment:
+  exp_name: robobrain_mix
+  exp_dir: outputs/${experiment.exp_name}
+  task:
+    type: compress
+    entrypoint: flagscale/compress/compressor_mix_precision.py
+  runner:
+    hostfile: null
+    cmds:
+      before_start: source activate flagscale-inference
+    envs:
+      CUDA_VISIBLE_DEVICES: 0
+      CUDA_DEVICE_MAX_CONNECTIONS: 1
+
+action: run
+
+hydra:
+  run:
+    dir: ${experiment.exp_dir}/hydra
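Taken together, `compress_mix.yaml` composes the `mix_precision` group above and points the task at `flagscale/compress/compressor_mix_precision.py`. As a hedged sketch of the wiring (not the PR's actual entrypoint file): the `cfg.compress.*` nesting, the `model.model_path` key, and the dataloader handling below are assumptions based on the defaults lists in these two configs.

```python
import hydra
from omegaconf import DictConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

from flagscale.compress.adapter import LLMCompressorAdapter


@hydra.main(version_base=None, config_path="../../examples/robobrain/conf", config_name="compress_mix")
def main(cfg: DictConfig) -> None:
    compress = cfg.compress  # composed from compress/mix_precision.yaml (assumed nesting)
    model_path = compress.model.model_path  # assumed key provided by the `model` default
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        use_fast=compress.tokenizer_args.use_fast,
        trust_remote_code=compress.tokenizer_args.trust_remote_code,
    )
    adapter = LLMCompressorAdapter(
        model=model,
        tokenizer=tokenizer,
        dataset=None,  # a calibration dataloader built from compress.data would go here
        output_dir=compress.system.save_dir,
        num_calibration_steps=compress.data.num_calibration_samples,
        scheme=compress.compress_args.scheme,
        targets=list(compress.compress_args.targets),
    )
    adapter.run()


if __name__ == "__main__":
    main()
```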
220 changes: 75 additions & 145 deletions flagscale/compress/adapter.py
@@ -1,162 +1,92 @@
-import torch
-from compressed_tensors.quantization import (
-    QuantizationConfig,
-    QuantizationScheme,
-    QuantizationStatus,
-    apply_quantization_config,
-    disable_quantization,
-    enable_quantization,
-    is_preset_scheme,
-    preset_name_to_scheme,
-)
-from compressed_tensors.quantization.lifecycle.apply import find_name_or_class_matches
-from llmcompressor.modifiers.quantization.calibration import (
-    freeze_module_quantization,
-    initialize_observer,
-    update_weight_zp_scale,
-)
-from llmcompressor.modifiers.quantization.gptq.utils import get_output_error
-from llmcompressor.modifiers.quantization.gptq.utils.gptq_wrapper import GPTQWrapper
-from llmcompressor.modifiers.utils.layer_compressor import LayerCompressor
-from llmcompressor.modifiers.utils.pytorch_helpers import run_calibration_forward
-from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
-    modify_save_pretrained,
-)
-from llmcompressor.utils.fsdp.context import fix_fsdp_module_name
-from llmcompressor.utils.helpers import DisableKVCache
-
-from flagscale.runner.utils import logger
+import os
+from typing import Optional, Dict, Any, Union
+
+from transformers import PreTrainedModel, PreTrainedTokenizer
+
+from llmcompressor import oneshot
+
+from flagscale.logger import logger
 
 __all__ = ["LLMCompressorAdapter"]
 
-QUANT_MAPPING_NAMES = {"gptq": GPTQWrapper}
-
 class LLMCompressorAdapter:
     def __init__(
         self,
-        model,
-        scheme,
-        targets,
-        algo=None,
-        ignore=None,
-        dataset=None,
-        num_calibration_steps=384,
+        model: PreTrainedModel,
+        tokenizer: Optional[PreTrainedTokenizer] = None,
+        dataset: Optional[Any] = None,
+        output_dir: str = "./output",
+        num_calibration_steps: int = 512,
+        **kwargs
     ):
         self.model = model
-        modify_save_pretrained(self.model)
-        if algo is not None:
-            assert len(algo) == 1
-            for k, v in algo.items():
-                self.algo = k
-                self.algo_args = v
-        else:
-            self.algo = algo
-        self.scheme = scheme
-        self.ignore = ignore
-        self.targets = targets
-        self.wrapper_cls = None
-        self.layer_compressors_ = []
-        self.num_calibration_steps = num_calibration_steps
+        self.tokenizer = tokenizer
         self.dataset = dataset
-
-        if (self.algo is None and is_preset_scheme(self.scheme)) or self.algo in list(
-            QUANT_MAPPING_NAMES.keys()
-        ):
-            self.wrapper_cls = QUANT_MAPPING_NAMES[self.algo] if self.algo is not None else None
-        quant_config = self.init_quant_config()
-
-        ### find ignore and target to quant, initialize module for quant
-        ### overwrite forward if quantization_enabled is Tue
-        apply_quantization_config(self.model, quant_config)
-        if self.wrapper_cls is None:
-            self.preprocess_weight()
-        else:
-            self.init_compressor()
-            if self.dataset is not None:
-                self.run_blockwise_calib_forward()
-            self.model.apply(freeze_module_quantization)
-
-    def init_quant_config(self):
-        if self.scheme is not None:
-            # takes precedence over config_groups
-            if isinstance(self.scheme, str) and is_preset_scheme(self.scheme):
-                # attach targets to scheme
-                self.scheme = {self.scheme: self.targets}
-
-            self.config_groups = {}
-            for idx, key in enumerate(self.scheme.keys()):
-                if is_preset_scheme(key):
-                    scheme = preset_name_to_scheme(key, self.scheme[key])
-                else:
-                    scheme = QuantizationScheme.model_validate(
-                        {"targets": self.scheme[key], **self.scheme}
-                    )
-
-                group_name = f"group_{idx}"
-                self.config_groups[group_name] = scheme
-
-        if self.config_groups is None or len(self.config_groups) == 0:
-            default_quant_scheme = QuantizationScheme(targets=self.targets)
-            self.config_groups = {"group_0": default_quant_scheme}
-            logger.info(f"No config groups were provided, using default {self.config_groups}")
-
-        return QuantizationConfig(
-            config_groups=self.config_groups,
-            kv_cache_scheme=None,  ### TODO(lvmengsi): not support kv cache quant for now
-            quantization_status=QuantizationStatus.INITIALIZED,
-            ignore=self.ignore,
-        )
-
-    def init_compressor(self):
-        for name, layer in self.model.named_modules():
-            name = fix_fsdp_module_name(name)
-            if name is None:
-                continue
-            try:
-                idx = int(name.split(".")[-1])
-            except:
-                continue
-            if find_name_or_class_matches(name, layer, self.ignore):
-                continue
-            logger.info(f"prepare compressor for layer {name}")
-            compressor = LayerCompressor(
-                self.wrapper_cls, self.model, layer, idx, name, self.algo_args
-            )
-            self.layer_compressors_.append(compressor)
-        self.layer_compressors_[0].set_early_stop()
-
-    def preprocess_weight(self):
-        for idx, (name, layer) in enumerate(self.model.named_modules()):
-            layer.apply(lambda module: initialize_observer(layer, base_name="weight"))
-        self.model.apply(update_weight_zp_scale)
-
-    def add_hook(self):
-        pass
-
-    @torch.no_grad()
-    def run_blockwise_calib_forward(self):
-        logger.info("start calibration")
-        self.model.apply(disable_quantization)
-        with DisableKVCache(self.model):
-            intermediates = run_calibration_forward(
-                self.model,
-                self.dataset,
-                num_calibration_steps=self.num_calibration_steps,
-                mask_padding=False,
-            )
-        self.layer_compressors_[0].clear_early_stop()
-
-        for idx, layer_compressor in enumerate(self.layer_compressors_):
-            logger.info(f"start calibration layer {layer_compressor.name}")
-            layer_compressor.pre_compress()
-            unquantized_outputs = layer_compressor.calibrate_layer(intermediates)
-            layer_compressor.compress()
-            layer_compressor.post_compress()
-            layer_compressor.revert_layer_wrappers()
-            quantized_outputs = layer_compressor.calibrate_layer(intermediates)
-            error = get_output_error(unquantized_outputs, quantized_outputs)
-            logger.info(f"Mean output error from quantization: {error:.3f}")
-            intermediates = quantized_outputs
-        self.model.apply(enable_quantization)
+        self.output_dir = output_dir
+        self.num_calibration_steps = num_calibration_steps
+
+        self.algo = kwargs.get("algo", {})
+        self.scheme = kwargs.get("scheme", "W8A16")
+        self.targets = kwargs.get("targets", ["Linear"])
+        self.ignore = kwargs.get("ignore", [])
+
+        self.is_mix_precision = (self.scheme == "mix_precision_search") or (
+            isinstance(self.algo, str) and self.algo == "mix_precision"
+        )
+
+    def _prepare_recipe(self):
+        from llmcompressor.modifiers.quantization import QuantizationModifier
+
+        if not self.is_mix_precision:
+            modifier = QuantizationModifier(
+                targets=self.targets,
+                ignore=self.ignore,
+                scheme=self.scheme,
+                **(self.algo if isinstance(self.algo, dict) else {}),
+            )
+            return [modifier]
+        else:
+            logger.info("Detected Mixed Precision Mode. Recipe will be handled by the pipeline.")
+            return None
+
+    def run(self):
+        logger.info(f"Starting compression with scheme: {self.scheme}")
+
+        if self.is_mix_precision:
+            try:
+                import flagscale.compress.pipelines.mix_precision_pipeline
+                logger.info("Successfully registered MixPrecisionPipeline.")
+            except ImportError as e:
+                raise ImportError(
+                    f"Failed to import mix_precision_pipeline: {e}. Please check your PYTHONPATH."
+                )
+
+        recipe = self._prepare_recipe()
+
+        oneshot_args = {
+            "model": self.model,
+            "dataset": self.dataset,
+            "output_dir": self.output_dir,
+            "num_calibration_batches": self.num_calibration_steps,
+        }
+
+        if self.is_mix_precision:
+            from llmcompressor.pipelines.registry import CalibrationPipeline
+
+            pipeline_cls = CalibrationPipeline.load_from_registry("mix_precision_search")
+
+            logger.info("Invoking MixPrecisionPipeline manually...")
+            pipeline_cls(
+                model=self.model,
+                dataloader=self.dataset,
+                dataset_args=None,
+                output_dir=self.output_dir,
+            )
+        else:
+            oneshot_args["recipe"] = recipe
+            oneshot(**oneshot_args)
+
+        self.save_artifacts()
+
+    def save_artifacts(self):
+        if self.tokenizer:
+            self.tokenizer.save_pretrained(self.output_dir)
+        logger.info(f"Artifacts saved to {self.output_dir}")
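For reference, a minimal usage sketch of the rewritten adapter API; the checkpoint name, calibration split, and `ignore` list are placeholders, while the constructor signature and `run()` come straight from the diff above.

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from flagscale.compress.adapter import LLMCompressorAdapter

model_path = "Qwen/Qwen3-30B-A3B"  # hypothetical checkpoint for illustration
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=True)

# The non-search path hands this to llmcompressor's oneshot(); the
# mixed-precision branch passes it through to the pipeline as `dataloader`.
calib = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:128]")

adapter = LLMCompressorAdapter(
    model=model,
    tokenizer=tokenizer,
    dataset=calib,
    output_dir="./outputs/robobrain_mix",
    num_calibration_steps=128,
    scheme="mix_precision_search",  # any other scheme falls back to the oneshot() path
    targets=["Linear"],
    ignore=["lm_head"],
)
adapter.run()  # search or oneshot quantization, then save_artifacts()
```

Note that the mixed-precision branch forwards `dataset` unmodified as `dataloader=`, so in practice it likely needs to be a tokenized `DataLoader` rather than a raw dataset.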