diff --git a/.gitignore b/.gitignore index 04c27b4..643d990 100644 --- a/.gitignore +++ b/.gitignore @@ -173,3 +173,6 @@ runs/ exps/ wandb wandb/ + +# generated calib data +datasets/ diff --git a/deepcompressor/app/diffusion/dataset/base.py b/deepcompressor/app/diffusion/dataset/base.py index 535b191..f0c177d 100644 --- a/deepcompressor/app/diffusion/dataset/base.py +++ b/deepcompressor/app/diffusion/dataset/base.py @@ -39,6 +39,9 @@ def __len__(self) -> int: return len(self.filepaths) def __getitem__(self, idx) -> dict[str, tp.Any]: + + # TODO: verify ZImage data loading. + data = np.load(self.filepaths[idx], allow_pickle=True).item() if isinstance(data["input_args"][0], str): name = data["input_args"][0] diff --git a/deepcompressor/app/diffusion/dataset/calib.py b/deepcompressor/app/diffusion/dataset/calib.py index f794d30..4b5b637 100644 --- a/deepcompressor/app/diffusion/dataset/calib.py +++ b/deepcompressor/app/diffusion/dataset/calib.py @@ -15,6 +15,7 @@ FluxSingleTransformerBlock, FluxTransformerBlock, ) +from diffusers.models.transformers.transformer_z_image import ZImageTransformerBlock from omniconfig import configclass from deepcompressor.data.cache import ( @@ -172,6 +173,18 @@ def _init_cache(self, name: str, module: nn.Module) -> IOTensorsCache: ), outputs=TensorCache(channels_dim=-1, reshape=LinearReshapeFn()), ) + elif isinstance(module, ZImageTransformerBlock): + return IOTensorsCache( + inputs=TensorsCache( + OrderedDict( + x=TensorCache(channels_dim=-1, reshape=LinearReshapeFn()), + attn_mask=TensorCache(channels_dim=-1, reshape=LinearReshapeFn()), + freqs_cis=TensorCache(channels_dim=-1, reshape=LinearReshapeFn()), + # TODO: verify + ) + ), + outputs=TensorCache(channels_dim=-1, reshape=LinearReshapeFn()), + ) elif isinstance(module, Attention): return IOTensorsCache( inputs=TensorsCache( diff --git a/deepcompressor/app/diffusion/dataset/collect/utils.py b/deepcompressor/app/diffusion/dataset/collect/utils.py index dc4169d..be9b1ce 100644 --- a/deepcompressor/app/diffusion/dataset/collect/utils.py +++ b/deepcompressor/app/diffusion/dataset/collect/utils.py @@ -10,6 +10,7 @@ FluxTransformer2DModel, PixArtTransformer2DModel, SanaTransformer2DModel, + ZImageTransformer2DModel, ) from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel @@ -58,10 +59,19 @@ def __call__( new_args.append(input_kwargs.pop("hidden_states")) elif isinstance(module, FluxTransformer2DModel): new_args.append(input_kwargs.pop("hidden_states")) + elif isinstance(module, ZImageTransformer2DModel): + new_args.append(input_kwargs.pop("x")) + new_args.append(input_kwargs.pop("t")) + new_args.append(input_kwargs.pop("cap_feats")) else: raise ValueError(f"Unknown model: {module}") cache = tree_map(lambda x: x.cpu(), {"input_args": new_args, "input_kwargs": input_kwargs, "outputs": output}) - split_cache = tree_split(cache) + + if isinstance(module, ZImageTransformer2DModel): + # assume that batch size is 1.
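+ # (tree_split fans the cache out into one entry per sample along the batch dimension; with a single sample the whole cache is kept as one entry instead.)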
+ split_cache = [cache] + else: + split_cache = tree_split(cache) if isinstance(module, PixArtTransformer2DModel) and self.zero_redundancy: for cache in split_cache: diff --git a/deepcompressor/app/diffusion/nn/patch.py b/deepcompressor/app/diffusion/nn/patch.py index a39ff40..5bf4c1c 100644 --- a/deepcompressor/app/diffusion/nn/patch.py +++ b/deepcompressor/app/diffusion/nn/patch.py @@ -1,8 +1,10 @@ import torch.nn as nn from diffusers.models.attention_processor import Attention from diffusers.models.transformers.transformer_flux import FluxSingleTransformerBlock +from diffusers.models.transformers.transformer_z_image import ZImageTransformer2DModel, ZImageTransformerBlock from deepcompressor.nn.patch.conv import ConcatConv2d, ShiftedConv2d +from deepcompressor.nn.patch.ff import convert_z_image_ff from deepcompressor.nn.patch.linear import ConcatLinear, ShiftedLinear from deepcompressor.utils import patch, tools @@ -116,3 +118,11 @@ def replace_attn_processor(model: nn.Module) -> None: logger.info(f"+ Replacing {name} processor with DiffusionAttentionProcessor.") module.set_processor(DiffusionAttentionProcessor(module.processor)) tools.logging.Formatter.indent_dec() + + +def replace_zimage_feedforward(z_image_model: ZImageTransformer2DModel) -> None: + """Replace custom FeedForward module in `ZImageTransformerBlock`s with standard FeedForward in diffusers lib.""" + for _, module in z_image_model.named_modules(): + if isinstance(module, ZImageTransformerBlock): + orig_ff = module.feed_forward + module.feed_forward = convert_z_image_ff(orig_ff) diff --git a/deepcompressor/app/diffusion/nn/struct.py b/deepcompressor/app/diffusion/nn/struct.py index a25c4f4..4d7197a 100644 --- a/deepcompressor/app/diffusion/nn/struct.py +++ b/deepcompressor/app/diffusion/nn/struct.py @@ -35,6 +35,12 @@ FluxTransformer2DModel, FluxTransformerBlock, ) +from diffusers.models.transformers.transformer_z_image import ( + ZImageTransformerBlock, + ZImageTransformer2DModel, + TimestepEmbedder, + RopeEmbedder +) from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel from diffusers.models.unets.unet_2d import UNet2DModel from diffusers.models.unets.unet_2d_blocks import ( @@ -46,6 +52,7 @@ UpBlock2D, ) from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel +from diffusers.pipelines.z_image.pipeline_z_image import ZImagePipeline from diffusers.pipelines import ( FluxControlPipeline, FluxFillPipeline, @@ -71,7 +78,7 @@ from deepcompressor.nn.struct.base import BaseModuleStruct from deepcompressor.utils.common import join_name -from .attention import DiffusionAttentionProcessor +from deepcompressor.app.diffusion.nn.attention import DiffusionAttentionProcessor # endregion @@ -85,6 +92,7 @@ FluxSingleTransformerBlock, FluxTransformerBlock, SanaTransformerBlock, + ZImageTransformerBlock, ] UNET_BLOCK_CLS = tp.Union[ DownBlock2D, @@ -100,6 +108,7 @@ SD3Transformer2DModel, FluxTransformer2DModel, SanaTransformer2DModel, + ZImageTransformer2DModel, ] UNET_CLS = tp.Union[UNet2DModel, UNet2DConditionModel] MODEL_CLS = tp.Union[DIT_CLS, UNET_CLS] @@ -112,6 +121,7 @@ FluxControlPipeline, FluxFillPipeline, SanaPipeline, + ZImagePipeline, ] PIPELINE_CLS = tp.Union[UNET_PIPELINE_CLS, DIT_PIPELINE_CLS] @@ -268,15 +278,18 @@ def _default_construct( @classmethod def _get_default_key_map(cls) -> dict[str, set[str]]: - unet_key_map = UNetStruct._get_default_key_map() - dit_key_map = DiTStruct._get_default_key_map() - flux_key_map = FluxStruct._get_default_key_map() key_map: dict[str, set[str]] = 
defaultdict(set) - for rkey, keys in unet_key_map.items(): - key_map[rkey].update(keys) - for rkey, keys in dit_key_map.items(): - key_map[rkey].update(keys) - for rkey, keys in flux_key_map.items(): + # unet_key_map = UNetStruct._get_default_key_map() + # dit_key_map = DiTStruct._get_default_key_map() + # flux_key_map = FluxStruct._get_default_key_map() + # for rkey, keys in unet_key_map.items(): + # key_map[rkey].update(keys) + # for rkey, keys in dit_key_map.items(): + # key_map[rkey].update(keys) + # for rkey, keys in flux_key_map.items(): + # key_map[rkey].update(keys) + zimage_key_map = ZImageStruct._get_default_key_map() + for rkey, keys in zimage_key_map.items(): key_map[rkey].update(keys) return {k: v for k, v in key_map.items() if v} @@ -362,7 +375,7 @@ def _default_construct( o_proj_rname = "to_out.0" assert isinstance(o_proj, nn.Linear) elif parent is not None: - assert isinstance(parent.module, FluxSingleTransformerBlock) + assert isinstance(parent.module, (FluxSingleTransformerBlock, ZImageTransformerBlock)) assert isinstance(parent.module.proj_out, ConcatLinear) assert len(parent.module.proj_out.linears) == 2 o_proj = parent.module.proj_out.linears[0] @@ -580,11 +593,11 @@ def __post_init__(self) -> None: super().__post_init__() self.norm_key = join_name(self.key, self.norm_rkey, sep="_") self.add_norm_key = join_name(self.key, self.add_norm_rkey, sep="_") - self.attn_norm_structs = [ + self.pre_attn_norm_structs = [ DiffusionModuleStruct(norm, parent=self, fname="pre_attn_norm", rname=rname, rkey=self.norm_rkey, idx=idx) for idx, (norm, rname) in enumerate(zip(self.pre_attn_norms, self.pre_attn_norm_rnames, strict=True)) ] - self.add_attn_norm_structs = [ + self.pre_attn_add_norm_structs = [ DiffusionModuleStruct( norm, parent=self, fname="pre_attn_add_norm", rname=rname, rkey=self.add_norm_rkey, idx=idx ) @@ -606,10 +619,10 @@ def __post_init__(self) -> None: ) def named_key_modules(self) -> tp.Generator[tp.Tuple[str, str, nn.Module, BaseModuleStruct, str], None, None]: - for attn_norm in self.attn_norm_structs: + for attn_norm in self.pre_attn_norm_structs: if attn_norm.module is not None: yield from attn_norm.named_key_modules() - for add_attn_norm in self.add_attn_norm_structs: + for add_attn_norm in self.pre_attn_add_norm_structs: if add_attn_norm.module is not None: yield from add_attn_norm.named_key_modules() for attn_struct in self.attn_structs: @@ -705,6 +718,15 @@ def _default_construct( ffn, ffn_rname = module.ff, "ff" pre_add_ffn_norm, pre_add_ffn_norm_rname = module.norm2_context, "norm2_context" add_ffn, add_ffn_rname = module.ff_context, "ff_context" + elif isinstance(module, ZImageTransformerBlock): + parallel = False + norm_type, add_norm_type = "rms_norm", None + pre_attn_norms, pre_attn_norm_rnames = [module.attention_norm1], ["attention_norm1"] + attns, attn_rnames = [module.attention], ["attention"] + pre_attn_add_norms, pre_attn_add_norm_rnames = [], [] + pre_ffn_norm, pre_ffn_norm_rname = module.ffn_norm1, "ffn_norm1" + ffn, ffn_rname = module.feed_forward, "feed_forward" + pre_add_ffn_norm, pre_add_ffn_norm_rname, add_ffn, add_ffn_rname = None, "", None, "" else: raise NotImplementedError(f"Unsupported module type: {type(module)}") return DiffusionTransformerBlockStruct( @@ -1696,6 +1718,8 @@ def _default_construct( module = module.transformer if isinstance(module, FluxTransformer2DModel): return FluxStruct.construct(module, parent=parent, fname=fname, rname=rname, rkey=rkey, idx=idx, **kwargs) + elif isinstance(module, ZImageTransformer2DModel): + 
return ZImageStruct.construct(module, parent=parent, fname=fname, rname=rname, rkey=rkey, idx=idx, **kwargs) else: if isinstance(module, PixArtTransformer2DModel): input_embed, input_embed_rname = module.pos_embed, "pos_embed" @@ -1946,6 +1970,260 @@ def _get_default_key_map(cls) -> dict[str, set[str]]: return {k: v for k, v in key_map.items() if v} +@dataclass(kw_only=True) +class ZImageStruct(DiffusionModelStruct, DiffusionTransformerStruct): + # region relative keys + # TODO + noise_refiner_rkey: tp.ClassVar[str] = "nrk" + context_refiner_rkey: tp.ClassVar[str] = "crk" + layers_rkey: tp.ClassVar[str] = "lk" + # endregion + + module: ZImageTransformer2DModel = field(repr=False, kw_only=False) + """the module of ZImageTransformer2DModel""" + + # region child modules + all_x_embedder: nn.ModuleDict + all_final_layer: nn.ModuleDict + noise_refiner: nn.ModuleList + context_refiner: nn.ModuleList + t_embedder: TimestepEmbedder + cap_embedder: nn.Sequential + x_pad_token: nn.Parameter + cap_pad_token: nn.Parameter + layers: nn.ModuleList + rope_embedder: RopeEmbedder + # endregion + + # region relative names + all_x_embedder_rname: str + all_final_layer_rname: str + noise_refiner_rname: str + context_refiner_rname: str + t_embedder_rname: str + cap_embedder_rname: str + x_pad_token_rname: str + cap_pad_token_rname: str + layers_rname: str + rope_embedder_rname: str + # endregion + + # region absolute names + noise_refiner_names: list[str] = field(init=False, repr=False) + context_refiner_names: list[str] = field(init=False, repr=False) + layers_names: list[str] = field(init=False, repr=False) + # endregion + + # region absolute keys + # TODO + # endregion + + # region child structs + noise_refiner_structs: list[DiffusionTransformerBlockStruct] = field(init=False) + context_refiner_structs: list[DiffusionTransformerBlockStruct] = field(init=False) + layers_structs: list[DiffusionTransformerBlockStruct] = field(init=False) + # endregion + + + @property + def num_blocks(self) -> int: + return len(self.noise_refiner_structs) + len(self.context_refiner_structs) + len(self.layers_structs) + + @property + def block_structs(self) -> list[DiffusionTransformerBlockStruct]: + return [*self.noise_refiner_structs, *self.context_refiner_structs, *self.layers_structs] + + @property + def block_names(self) -> list[str]: + return [*self.noise_refiner_names, *self.context_refiner_names, *self.layers_names] + + def __post_init__(self) -> None: + BaseModuleStruct.__post_init__(self) + noise_refiner_indexed_rnames = [ + f"{self.noise_refiner_rname}.{idx}" for idx in range(len(self.noise_refiner)) + ] + self.noise_refiner_names = [join_name(self.name, indexed_rname) for indexed_rname in noise_refiner_indexed_rnames] + + context_refiner_indexed_rnames = [ + f"{self.context_refiner_rname}.{idx}" for idx in range(len(self.context_refiner)) + ] + self.context_refiner_names = [join_name(self.name, indexed_rname) for indexed_rname in context_refiner_indexed_rnames] + + layers_indexed_rnames = [ + f"{self.layers_rname}.{idx}" for idx in range(len(self.layers)) + ] + self.layers_names = [join_name(self.name, indexed_rname) for indexed_rname in layers_indexed_rnames] + + self.pre_module_structs = OrderedDict() + self.post_module_structs = OrderedDict() + + self.noise_refiner_structs = [ + self.transformer_block_struct_cls.construct( + block, + parent=self, + fname="noise_refiner", + rname=rname, + rkey=self.noise_refiner_rkey, # TODO + idx=idx, + ) + for idx, (block, rname) in enumerate( + zip(self.noise_refiner, 
noise_refiner_indexed_rnames, strict=True) + ) + ] + + self.context_refiner_structs = [ + self.transformer_block_struct_cls.construct( + block, + parent=self, + fname="context_refiner", + rname=rname, + rkey=self.context_refiner_rkey, # TODO + idx=idx, + ) + for idx, (block, rname) in enumerate( + zip(self.context_refiner, context_refiner_indexed_rnames, strict=True) + ) + ] + + self.layers_structs = [ + self.transformer_block_struct_cls.construct( + block, + parent=self, + fname="layers", + rname=rname, + rkey=self.layers_rkey, # TODO + idx=idx, + ) + for idx, (block, rname) in enumerate( + zip(self.layers, layers_indexed_rnames, strict=True) + ) + ] + + + def _get_iter_block_activations_args( + self, **input_kwargs + ) -> tuple[list[nn.Module], list[DiffusionModuleStruct | DiffusionBlockStruct], list[bool], list[bool]]: + layers, layer_structs, recomputes, use_prev_layer_outputs = [], [], [], [] + + layers.extend(self.noise_refiner) + layer_structs.extend(self.noise_refiner_structs) + use_prev_layer_outputs.append(False) + use_prev_layer_outputs.extend([True] * (len(self.noise_refiner) - 1)) + recomputes.extend([False] * len(self.noise_refiner)) + + layers.extend(self.context_refiner) + layer_structs.extend(self.context_refiner_structs) + use_prev_layer_outputs.append(False) + use_prev_layer_outputs.extend([True] * (len(self.context_refiner) - 1)) + recomputes.extend([False] * len(self.context_refiner)) + + layers.extend(self.layers) + layer_structs.extend(self.layers_structs) + use_prev_layer_outputs.append(False) + use_prev_layer_outputs.extend([True] * (len(self.layers) - 1)) + recomputes.extend([False] * len(self.layers)) + + return layers, layer_structs, recomputes, use_prev_layer_outputs + + + def get_prev_module_keys(self) -> tuple[str, ...]: + return tuple() + + + def get_post_module_keys(self) -> tuple[str, ...]: + return tuple() + + + @staticmethod + def _default_construct( + module: tp.Union[ZImagePipeline, ZImageTransformer2DModel], + /, + parent: tp.Optional[BaseModuleStruct] = None, + fname: str = "", + rname: str = "", + rkey: str = "", + idx: int = 0, + **kwargs, + ) -> "ZImageStruct": + if isinstance(module, ZImagePipeline): + module = module.transformer + if isinstance(module, ZImageTransformer2DModel): + all_x_embedder, all_x_embedder_rname = module.all_x_embedder, "all_x_embedder" + all_final_layer, all_final_layer_rname = module.all_final_layer, "all_final_layer" + noise_refiner, noise_refiner_rname = module.noise_refiner, "noise_refiner" + context_refiner, context_refiner_rname = module.context_refiner, "context_refiner" + t_embedder, t_embedder_rname = module.t_embedder, "t_embedder" + cap_embedder, cap_embedder_rname = module.cap_embedder, "cap_embedder" + x_pad_token, x_pad_token_rname = module.x_pad_token, "x_pad_token" + cap_pad_token, cap_pad_token_rname = module.cap_pad_token, "cap_pad_token" + layers, layers_rname = module.layers, "layers" + rope_embedder, rope_embedder_rname = module.rope_embedder, "rope_embedder" + return ZImageStruct( + module=module, + parent=parent, + fname=fname, + idx=idx, + rname=rname, + rkey=rkey, + + all_x_embedder=all_x_embedder, + all_final_layer=all_final_layer, + noise_refiner=noise_refiner, + context_refiner=context_refiner, + t_embedder=t_embedder, + cap_embedder=cap_embedder, + x_pad_token=x_pad_token, + cap_pad_token=cap_pad_token, + layers=layers, + rope_embedder=rope_embedder, + + all_x_embedder_rname=all_x_embedder_rname, + all_final_layer_rname=all_final_layer_rname, + noise_refiner_rname=noise_refiner_rname, + 
context_refiner_rname=context_refiner_rname, + t_embedder_rname=t_embedder_rname, + cap_embedder_rname=cap_embedder_rname, + x_pad_token_rname=x_pad_token_rname, + cap_pad_token_rname=cap_pad_token_rname, + layers_rname=layers_rname, + rope_embedder_rname=rope_embedder_rname, + + # these fields do not exist in the Z-Image model, so they are hard-coded to None + norm_in=None, + proj_in=None, + norm_out=None, + proj_out=None, + norm_in_rname=None, + proj_in_rname=None, + norm_out_rname=None, + proj_out_rname=None, + transformer_blocks=None, + transformer_blocks_rname=None + ) + raise NotImplementedError(f"Unsupported module type: {type(module)}") + + @classmethod + def _get_default_key_map(cls) -> dict[str, set[str]]: + key_map: dict[str, set[str]] = defaultdict(set) + for block_rkey, block_cls in ( + (cls.noise_refiner_rkey, cls.transformer_block_struct_cls), + (cls.context_refiner_rkey, cls.transformer_block_struct_cls), + (cls.layers_rkey, cls.transformer_block_struct_cls), + ): + block_key = block_rkey + block_key_map = block_cls._get_default_key_map() + for rkey, keys in block_key_map.items(): + brkey = join_name(block_rkey, rkey, sep="_") + for key in keys: + key = join_name(block_key, key, sep="_") + key_map[rkey].add(key) + key_map[brkey].add(key) + if block_rkey: + key_map[block_rkey].add(key) + return {k: v for k, v in key_map.items() if v} + + + DiffusionAttentionStruct.register_factory(Attention, DiffusionAttentionStruct._default_construct) DiffusionFeedForwardStruct.register_factory( @@ -1962,8 +2240,16 @@ def _get_default_key_map(cls) -> dict[str, set[str]]: tp.Union[FluxPipeline, FluxControlPipeline, FluxTransformer2DModel], FluxStruct._default_construct ) +ZImageStruct.register_factory( + tp.Union[ZImagePipeline, ZImageTransformer2DModel], ZImageStruct._default_construct +) + DiTStruct.register_factory(tp.Union[DIT_PIPELINE_CLS, DIT_CLS], DiTStruct._default_construct) DiffusionTransformerStruct.register_factory(Transformer2DModel, DiffusionTransformerStruct._default_construct) DiffusionModelStruct.register_factory(tp.Union[PIPELINE_CLS, MODEL_CLS], DiffusionModelStruct._default_construct) + + +if __name__ == "__main__": + print(ZImageStruct._get_default_key_map()) diff --git a/deepcompressor/app/diffusion/pipeline/config.py b/deepcompressor/app/diffusion/pipeline/config.py index bc7cbe2..33b7130 100644 --- a/deepcompressor/app/diffusion/pipeline/config.py +++ b/deepcompressor/app/diffusion/pipeline/config.py @@ -6,6 +6,8 @@ from dataclasses import dataclass, field import torch +from diffusers.models.transformers.transformer_z_image import ZImageTransformer2DModel +from diffusers.pipelines.z_image.pipeline_z_image import ZImagePipeline from diffusers.pipelines import ( AutoPipelineForText2Image, DiffusionPipeline, @@ -25,9 +27,10 @@ from ....nn.patch.linear import ConcatLinear, ShiftedLinear from ....nn.patch.lowrank import LowRankBranch from ..nn.patch import ( - replace_fused_linear_with_concat_linear, - replace_up_block_conv_with_concat_conv, - shift_input_activations, + # replace_fused_linear_with_concat_linear, + # replace_up_block_conv_with_concat_conv, + replace_zimage_feedforward, + # shift_input_activations, ) __all__ = ["DiffusionPipelineConfig"] @@ -344,6 +347,8 @@ def _default_build( path = "black-forest-labs/FLUX.1-Fill-dev" elif name == "flux.1-schnell": path = "black-forest-labs/FLUX.1-schnell" + elif name == "z-image-turbo": + path = "Tongyi-MAI/Z-Image-Turbo" else: raise ValueError(f"Path for {name} is not specified.") if name in ["flux.1-canny-dev", "flux.1-depth-dev"]: 
@@ -357,14 +362,18 @@ def _default_build( pipeline.text_encoder.to(dtype) else: pipeline = SanaPipeline.from_pretrained(path, torch_dtype=dtype) + elif name == "z-image-turbo": + pipeline = ZImagePipeline.from_pretrained(path, torch_dtype=dtype, low_cpu_mem_usage=False) else: pipeline = AutoPipelineForText2Image.from_pretrained(path, torch_dtype=dtype) pipeline = pipeline.to(device) model = pipeline.unet if hasattr(pipeline, "unet") else pipeline.transformer - replace_fused_linear_with_concat_linear(model) - replace_up_block_conv_with_concat_conv(model) - if shift_activations: - shift_input_activations(model) + # replace_fused_linear_with_concat_linear(model) + # replace_up_block_conv_with_concat_conv(model) + if isinstance(model, ZImageTransformer2DModel): + replace_zimage_feedforward(model) + # if shift_activations: + # shift_input_activations(model) return pipeline @staticmethod diff --git a/deepcompressor/backend/nunchaku/common.py b/deepcompressor/backend/nunchaku/common.py index f0886ef..57a6b51 100644 --- a/deepcompressor/backend/nunchaku/common.py +++ b/deepcompressor/backend/nunchaku/common.py @@ -194,6 +194,7 @@ def convert_to_nunchaku_transformer_block_state_dict( print( f" - Converting {block_name} weights of {candidate_local_names} to {converted_local_name}." f" (smooth_fused={smooth_fused}, shifted={shift is not None}, float_point={float_point})" + f" smooth={type(smooth)}, branch={type(branch)}, bias={type(bias)}, shift={type(shift)}" ) update_state_dict( converted, diff --git a/deepcompressor/backend/nunchaku/convert.py b/deepcompressor/backend/nunchaku/convert.py index d37d0ba..ff3dfcb 100644 --- a/deepcompressor/backend/nunchaku/convert.py +++ b/deepcompressor/backend/nunchaku/convert.py @@ -7,6 +7,7 @@ import torch import tqdm +from .z_image import convert_to_nunchaku_z_image_state_dicts from .flux import convert_to_nunchaku_flux_state_dicts @@ -16,6 +17,7 @@ parser.add_argument("--output-root", type=str, default="", help="root to the output checkpoint directory.") parser.add_argument("--model-name", type=str, default=None, help="name of the model.") parser.add_argument("--float-point", action="store_true", help="use float-point 4-bit quantization.") + parser.add_argument("--dry-run", action="store_true", help="if set, state dicts will NOT be saved.") args = parser.parse_args() if not args.output_root: args.output_root = args.quant_path @@ -26,25 +28,46 @@ else: model_name = args.model_name assert model_name, "Model name must be provided." - assert "flux" in model_name.lower(), "Only Flux models are supported." + assert "flux" in model_name.lower() or "z-image" in model_name.lower(), f"Only Flux and Z-Image models are supported, got {model_name}." 
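+ # NOTE: the same substring checks on the model name select the Flux or Z-Image converter below.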
state_dict_path = os.path.join(args.quant_path, "model.pt") scale_dict_path = os.path.join(args.quant_path, "scale.pt") smooth_dict_path = os.path.join(args.quant_path, "smooth.pt") branch_dict_path = os.path.join(args.quant_path, "branch.pt") map_location = "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 0 else "cpu" state_dict = torch.load(state_dict_path, map_location=map_location) scale_dict = torch.load(scale_dict_path, map_location="cpu") smooth_dict = torch.load(smooth_dict_path, map_location=map_location) if os.path.exists(smooth_dict_path) else {} branch_dict = torch.load(branch_dict_path, map_location=map_location) if os.path.exists(branch_dict_path) else {} - converted_state_dict, other_state_dict = convert_to_nunchaku_flux_state_dicts( - state_dict=state_dict, - scale_dict=scale_dict, - smooth_dict=smooth_dict, - branch_dict=branch_dict, - float_point=args.float_point, - ) - output_dirpath = os.path.join(args.output_root, model_name) - os.makedirs(output_dirpath, exist_ok=True) - safetensors.torch.save_file(converted_state_dict, os.path.join(output_dirpath, "transformer_blocks.safetensors")) - safetensors.torch.save_file(other_state_dict, os.path.join(output_dirpath, "unquantized_layers.safetensors")) - print(f"Quantized model saved to {output_dirpath}.") + if "flux" in model_name.lower(): + converted_state_dict, other_state_dict = convert_to_nunchaku_flux_state_dicts( + state_dict=state_dict, + scale_dict=scale_dict, + smooth_dict=smooth_dict, + branch_dict=branch_dict, + float_point=args.float_point, + ) + elif "z-image" in model_name.lower(): + if any("refiner" in k for k in smooth_dict): + skip_refiners = False + else: + skip_refiners = True + print(f" - skip_refiners = {skip_refiners}") + converted_state_dict, other_state_dict = convert_to_nunchaku_z_image_state_dicts( + model_dict=state_dict, + scale_dict=scale_dict, + smooth_dict=smooth_dict, + branch_dict=branch_dict, + float_point=args.float_point, + skip_refiners=skip_refiners, + ) + else: + raise ValueError(f"Only Flux and Z-Image models are supported, got {model_name}.") + + if args.dry_run: + print("Program in DRY RUN mode. 
Quantized checkpoints NOT saved.") + else: + output_dirpath = os.path.join(args.output_root, model_name) + os.makedirs(output_dirpath, exist_ok=True) + safetensors.torch.save_file(converted_state_dict, os.path.join(output_dirpath, "transformer_blocks.safetensors")) + safetensors.torch.save_file(other_state_dict, os.path.join(output_dirpath, "unquantized_layers.safetensors")) + print(f"Quantized model saved to {output_dirpath}.") diff --git a/deepcompressor/backend/nunchaku/z_image.py b/deepcompressor/backend/nunchaku/z_image.py new file mode 100644 index 0000000..2815700 --- /dev/null +++ b/deepcompressor/backend/nunchaku/z_image.py @@ -0,0 +1,171 @@ +from collections import OrderedDict +import torch + +from .common import convert_to_nunchaku_transformer_block_state_dict, update_state_dict + + +def _print_dict(d: dict, dict_name: str): + print(f"################# print {dict_name} ################") + for k, v in d.items(): + if isinstance(v, torch.Tensor): + print( + f"{dict_name}_key: {k} -> value tensor shape: {v.shape}, dtype: {v.dtype}") + elif isinstance(v, OrderedDict): + for sub_k, sub_v in v.items(): + if isinstance(sub_v, torch.Tensor): + print( + f"{dict_name}_key: {k}/{sub_k} -> tensor shape: {sub_v.shape}, dtype: {sub_v.dtype}") + else: + print( + f"{dict_name}_key: {k}/{sub_k} -> value type: {type(sub_v)}, {sub_v}") + else: + print(f"{dict_name}_key: {k} -> value type: {type(v)}, {v}") + print("\n") + + +def _replace_lora_and_smooth_key(transformer_block_state_dict: dict): + replaced = {} + for k, v in transformer_block_state_dict.items(): + if ".lora_down" in k: + new_k = k.replace(".lora_down", ".proj_down") + elif ".lora_up" in k: + new_k = k.replace(".lora_up", ".proj_up") + elif ".smooth_orig" in k: + new_k = k.replace(".smooth_orig", ".smooth_factor_orig") + elif ".smooth" in k: + new_k = k.replace(".smooth", ".smooth_factor") + else: + new_k = k + replaced[new_k] = v + return replaced + + + +def z_image_transformer_block_convert( + model_dict: dict[str, torch.Tensor], + scale_dict: dict[str, torch.Tensor], + smooth_dict: dict[str, torch.Tensor], + branch_dict: dict[str, torch.Tensor], + block_name: str, + float_point: bool = False, +) -> dict[str, torch.Tensor]: + converted_quantized_part_names = [ + "attention.to_qkv", # attention q,k,v + "attention.to_out.0", # attention to_out + "feed_forward.net.0.proj", # feed forward up proj + "feed_forward.net.2", # feed forward down proj + ] + + def _original_name(converted_name: str): + if "to_qkv" in converted_name: + return [ + converted_name.replace("to_qkv", "to_q"), + converted_name.replace("to_qkv", "to_k"), + converted_name.replace("to_qkv", "to_v") + ] + else: + return converted_name + + def _smooth_name(converted_name: str): + if "to_qkv" in converted_name: + return converted_name.replace("to_qkv", "to_q") + else: + return converted_name + + _branch_name = _smooth_name + + converted_transformer_block_state_dict = convert_to_nunchaku_transformer_block_state_dict( + state_dict=model_dict, + scale_dict=scale_dict, + smooth_dict=smooth_dict, + branch_dict=branch_dict, + block_name=block_name, + local_name_map={ + name: _original_name(name) for name in converted_quantized_part_names + }, + smooth_name_map={ + name: _smooth_name(name) for name in converted_quantized_part_names + }, + branch_name_map={ + name: _branch_name(name) for name in converted_quantized_part_names + }, + convert_map={ + name: "linear" for name in converted_quantized_part_names + }, + float_point=float_point, + ) + + not_quantized_parts = [ + # all 
norm layers are not quantized. + "attention.norm_q.weight", + "attention.norm_k.weight", + "attention_norm1.weight", + "attention_norm2.weight", + "ffn_norm1.weight", + "ffn_norm2.weight", + "adaLN_modulation.0.weight", + "adaLN_modulation.0.bias", + ] + + for part_name in not_quantized_parts: + absolute_name = f"{block_name}.{part_name}" + if absolute_name in model_dict: + print(f" - Copying {block_name} weights: {part_name}") + converted_transformer_block_state_dict[part_name] = model_dict[absolute_name].clone().cpu() + + converted_transformer_block_state_dict = _replace_lora_and_smooth_key(converted_transformer_block_state_dict) + return converted_transformer_block_state_dict + + +def convert_to_nunchaku_z_image_state_dicts( + model_dict: dict[str, torch.Tensor], + scale_dict: dict[str, torch.Tensor], + smooth_dict: dict[str, torch.Tensor], + branch_dict: dict[str, torch.Tensor], + float_point: bool = False, + skip_refiners: bool = False, +) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: + + _print_dict(model_dict, "model_dict") + _print_dict(scale_dict, "scale_dict") + _print_dict(smooth_dict, "smooth_dict") + _print_dict(branch_dict, "branch_dict") + + transformer_block_names: set[str] = set() + others: dict[str, torch.Tensor] = {} + + if skip_refiners: + transformer_block_name_prefix = ("layers.",) + else: + transformer_block_name_prefix = ( + "noise_refiner.", "context_refiner.", "layers.") + for param_name in model_dict.keys(): + if param_name.startswith(transformer_block_name_prefix): + block_name = ".".join(param_name.split(".")[:2]) + transformer_block_names.add(block_name) + else: + others[param_name] = model_dict[param_name] + + transformer_block_names = sorted(transformer_block_names, key=lambda x: ( + x.split(".")[0], int(x.split(".")[-1]))) + print(f"Converting {len(transformer_block_names)} transformer blocks...") + converted_state_dict: dict[str, torch.Tensor] = {} + for b_name in transformer_block_names: + converted_transformer_block = z_image_transformer_block_convert( + model_dict=model_dict, + scale_dict=scale_dict, + smooth_dict=smooth_dict, + branch_dict=branch_dict, + block_name=b_name, + float_point=float_point, + ) + update_state_dict( + converted_state_dict, + converted_transformer_block, + prefix=b_name, + ) + + _print_dict(converted_state_dict, "converted_state_dict") + _print_dict(others, "others") + + return converted_state_dict, others diff --git a/deepcompressor/csrc/load.py b/deepcompressor/csrc/load.py index 6aae3f1..2a1ed42 100644 --- a/deepcompressor/csrc/load.py +++ b/deepcompressor/csrc/load.py @@ -16,6 +16,7 @@ extra_cuda_cflags=[ "-O3", "-std=c++20", + "-I/usr/local/cuda/include", "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF_CONVERSIONS__", "-U__CUDA_NO_HALF2_OPERATORS__", diff --git a/deepcompressor/dataset/cache.py b/deepcompressor/dataset/cache.py index 7d63d48..ce56f0d 100644 --- a/deepcompressor/dataset/cache.py +++ b/deepcompressor/dataset/cache.py @@ -380,6 +380,7 @@ def _iter_layer_activations( # noqa: C901 if psutil.virtual_memory().percent > 90: raise RuntimeError("memory usage > 90%%, aborting") gc.collect() + tbar.close() else: # region we then forward the layer to collect activations device = next(layer.parameters()).device diff --git a/deepcompressor/nn/patch/ff.py b/deepcompressor/nn/patch/ff.py new file mode 100644 index 0000000..bde5699 --- /dev/null +++ b/deepcompressor/nn/patch/ff.py @@ -0,0 +1,52 @@ +import torch +import torch.nn as nn +from torch.nn.parameter import Parameter +from diffusers.models.attention 
import FeedForward +from diffusers.models.transformers.transformer_z_image import FeedForward as ZImageFeedForward + + +def convert_z_image_ff(zff: ZImageFeedForward) -> FeedForward: + assert isinstance(zff, ZImageFeedForward) + assert zff.w1.in_features == zff.w3.in_features + assert zff.w1.out_features == zff.w3.out_features + assert zff.w1.out_features == zff.w2.in_features + converted_ff = FeedForward( + dim=zff.w1.in_features, + dim_out=zff.w2.out_features, + dropout=0.0, + activation_fn="swiglu", + inner_dim=zff.w2.in_features, + bias=False, + ).to(dtype=zff.w1.weight.dtype, device=zff.w1.weight.device) + + up_proj: nn.Linear = converted_ff.net[0].proj + down_proj: nn.Linear = converted_ff.net[2] + with torch.no_grad(): + up_proj.weight.copy_(torch.cat([zff.w3.weight, zff.w1.weight], dim=0)) + down_proj.weight.copy_(zff.w2.weight) + + return converted_ff + + +if __name__ == "__main__": + _dim = 50 + _hidden_dim = 100 + z_image_ff = ZImageFeedForward(dim=_dim, hidden_dim=_hidden_dim) + with torch.no_grad(): + z_image_ff.w1.weight = Parameter(torch.randn(_hidden_dim, _dim, dtype=torch.float32)) + z_image_ff.w2.weight = Parameter(torch.randn(_dim, _hidden_dim, dtype=torch.float32)) + z_image_ff.w3.weight = Parameter(torch.randn(_hidden_dim, _dim, dtype=torch.float32)) + + converted_ff = convert_z_image_ff(z_image_ff) + + x = torch.randn(_dim, dtype=torch.float32) + + y1 = z_image_ff(x) + + y2 = converted_ff(x) + + print(f"y1: {y1}") + print(f"y2: {y2}") + + print(f"allclose: {torch.allclose(y1, y2)}") + diff --git a/deepcompressor/utils/common.py b/deepcompressor/utils/common.py index 8040a9a..12233d9 100644 --- a/deepcompressor/utils/common.py +++ b/deepcompressor/utils/common.py @@ -192,10 +192,10 @@ def tree_collate(batch: list[tp.Any] | tuple[tp.Any, ...]) -> tp.Any: return [tree_collate(samples) for samples in zip(*batch, strict=True)] elif isinstance(batch[0], torch.Tensor): # if all tensors in batch are exactly the same, return the tensor itself - if all(torch.equal(batch[0], b) for b in batch): - return batch[0] - else: - return torch.cat(batch) + # if all(torch.equal(batch[0], b) for b in batch): + # return batch[0] + # else: + return torch.cat(batch) else: return batch[0] diff --git a/examples/diffusion/configs/__default__.yaml b/examples/diffusion/configs/__default__.yaml index c0ad56c..cab9ef1 100644 --- a/examples/diffusion/configs/__default__.yaml +++ b/examples/diffusion/configs/__default__.yaml @@ -20,7 +20,7 @@ eval: gen_root: "{output}/{job}" ref_root: baselines/{dtype}/{model}/{protocol} gt_stats_root: benchmarks/stats - num_gpus: 8 + num_gpus: 1 batch_size_per_gpu: 1 chunk_start: 0 chunk_step: 1 diff --git a/examples/diffusion/configs/collect/qdiff.yaml b/examples/diffusion/configs/collect/qdiff.yaml index 3142610..b7cb714 100644 --- a/examples/diffusion/configs/collect/qdiff.yaml +++ b/examples/diffusion/configs/collect/qdiff.yaml @@ -1,5 +1,5 @@ collect: root: datasets dataset_name: qdiff - data_path: prompts/qdiff.yaml + data_path: examples/diffusion/prompts/qdiff.yaml num_samples: 128 \ No newline at end of file diff --git a/examples/diffusion/configs/model/z-image-turbo-rank128-skip-refiners.yaml b/examples/diffusion/configs/model/z-image-turbo-rank128-skip-refiners.yaml new file mode 100644 index 0000000..5d4af08 --- /dev/null +++ b/examples/diffusion/configs/model/z-image-turbo-rank128-skip-refiners.yaml @@ -0,0 +1,70 @@ +pipeline: + name: z-image-turbo + dtype: torch.bfloat16 +collect: + root: 
/data/dongd/dc_calib/datasets/{dtype}/{model}/{protocol}/{data}/s128 +eval: + num_steps: 9 + guidance_scale: 0 + protocol: fmeuler{num_steps}-g{guidance_scale} + benchmarks: + - "deepcompressor/dataset/MJHQ30KMETA.json" + num_samples: 300 +quant: + calib: + batch_size: 1 + path: datasets/{dtype}/{model}/{protocol}/{data}/s128 + num_samples: 128 + num_workers: 1 + wgts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + low_rank: + rank: 128 + sample_batch_size: 16 + sample_size: -1 + skips: + - nrk + - crk + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + ipts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + skips: + - nrk + - crk + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + opts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + smooth: + proj: + element_batch_size: -1 + sample_batch_size: 16 + element_size: -1 + sample_size: -1 + skips: + - nrk + - crk + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + attn: + sample_batch_size: 16 + sample_size: -1 diff --git a/examples/diffusion/configs/model/z-image-turbo-rank128.yaml b/examples/diffusion/configs/model/z-image-turbo-rank128.yaml new file mode 100644 index 0000000..272305c --- /dev/null +++ b/examples/diffusion/configs/model/z-image-turbo-rank128.yaml @@ -0,0 +1,64 @@ +pipeline: + name: z-image-turbo + dtype: torch.bfloat16 +collect: + root: /data/dongd/dc_calib/datasets/{dtype}/{model}/{protocol}/{data}/s128 +eval: + num_steps: 9 + guidance_scale: 0 + protocol: fmeuler{num_steps}-g{guidance_scale} + benchmarks: + - "deepcompressor/dataset/MJHQ30KMETA.json" + num_samples: 300 +quant: + calib: + batch_size: 1 + path: datasets/{dtype}/{model}/{protocol}/{data}/s128 + num_samples: 128 + num_workers: 1 + wgts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + low_rank: + rank: 128 + sample_batch_size: 16 + sample_size: -1 + skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + ipts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + opts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + smooth: + proj: + element_batch_size: -1 + sample_batch_size: 16 + element_size: -1 + sample_size: -1 + skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + attn: + sample_batch_size: 16 + sample_size: -1 diff --git a/examples/diffusion/configs/model/z-image-turbo-rank64.yaml b/examples/diffusion/configs/model/z-image-turbo-rank64.yaml new file mode 100644 index 0000000..33f7a57 --- /dev/null +++ b/examples/diffusion/configs/model/z-image-turbo-rank64.yaml @@ -0,0 +1,64 @@ +pipeline: + name: z-image-turbo + dtype: torch.bfloat16 +collect: + root: /data/dongd/dc_calib/datasets/{dtype}/{model}/{protocol}/{data}/s128 +eval: + num_steps: 9 + guidance_scale: 0 + protocol: fmeuler{num_steps}-g{guidance_scale} + benchmarks: + - "deepcompressor/dataset/MJHQ30KMETA.json" + num_samples: 300 +quant: + calib: + batch_size: 1 + path: datasets/{dtype}/{model}/{protocol}/{data}/s128 + num_samples: 128 + num_workers: 1 + wgts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + low_rank: + rank: 64 + 
sample_batch_size: 16 + sample_size: -1 + skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + ipts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + opts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + smooth: + proj: + element_batch_size: -1 + sample_batch_size: 16 + element_size: -1 + sample_size: -1 + skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + attn: + sample_batch_size: 16 + sample_size: -1 diff --git a/examples/diffusion/configs/model/z-image-turbo-skip-refiners.yaml b/examples/diffusion/configs/model/z-image-turbo-skip-refiners.yaml new file mode 100644 index 0000000..ce32e4b --- /dev/null +++ b/examples/diffusion/configs/model/z-image-turbo-skip-refiners.yaml @@ -0,0 +1,69 @@ +pipeline: + name: z-image-turbo + dtype: torch.bfloat16 +collect: + root: /data/dongd/dc_calib/datasets/{dtype}/{model}/{protocol}/{data}/s128 +eval: + num_steps: 9 + guidance_scale: 0 + protocol: fmeuler{num_steps}-g{guidance_scale} + benchmarks: + - "deepcompressor/dataset/MJHQ30KMETA.json" + num_samples: 300 +quant: + calib: + batch_size: 1 + path: datasets/{dtype}/{model}/{protocol}/{data}/s128 + num_samples: 128 + num_workers: 1 + wgts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + low_rank: + sample_batch_size: 16 + sample_size: -1 + skips: + - nrk + - crk + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + ipts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + skips: + - nrk + - crk + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + opts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + smooth: + proj: + element_batch_size: -1 + sample_batch_size: 16 + element_size: -1 + sample_size: -1 + skips: + - nrk + - crk + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + attn: + sample_batch_size: 16 + sample_size: -1 diff --git a/examples/diffusion/configs/model/z-image-turbo.yaml b/examples/diffusion/configs/model/z-image-turbo.yaml new file mode 100644 index 0000000..3630015 --- /dev/null +++ b/examples/diffusion/configs/model/z-image-turbo.yaml @@ -0,0 +1,63 @@ +pipeline: + name: z-image-turbo + dtype: torch.bfloat16 +collect: + root: /data/dongd/dc_calib/datasets/{dtype}/{model}/{protocol}/{data}/s128 +eval: + num_steps: 9 + guidance_scale: 0 + protocol: fmeuler{num_steps}-g{guidance_scale} + benchmarks: + - "deepcompressor/dataset/MJHQ30KMETA.json" + num_samples: 300 +quant: + calib: + batch_size: 1 + path: datasets/{dtype}/{model}/{protocol}/{data}/s128 + num_samples: 128 + num_workers: 1 + wgts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + low_rank: + sample_batch_size: 16 + sample_size: -1 + skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + ipts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + opts: + calib_range: + element_batch_size: 64 + sample_batch_size: 16 + element_size: 512 + sample_size: -1 + smooth: + proj: + element_batch_size: -1 + sample_batch_size: 16 + element_size: -1 + sample_size: -1 + 
skips: + - transformer_norm + - transformer_add_norm + - attn_add + - ffn_add + attn: + sample_batch_size: 16 + sample_size: -1 diff --git a/examples/diffusion/configs/svdquant/int4.yaml b/examples/diffusion/configs/svdquant/int4.yaml index 626635b..4190555 100644 --- a/examples/diffusion/configs/svdquant/int4.yaml +++ b/examples/diffusion/configs/svdquant/int4.yaml @@ -22,4 +22,4 @@ quant: - null allow_unsigned: true pipeline: - shift_activations: true \ No newline at end of file + shift_activations: false \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 55eb2ef..f9f1612 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ torchvision = ">= 0.18.1" torchmetrics = ">= 1.4.0" ninja = ">= 1.11.1" bitsandbytes = ">= 0.42.0" -transformers = ">= 4.46.0" +transformers = ">= 4.46.0, <=4.57.3" lm_eval = ">= 0.4.2" accelerate = ">= 0.26.0" datasets = ">= 2.16.0" @@ -44,7 +44,7 @@ bs4 = ">= 0.0.2" ftfy = ">= 6.2.0" cd-fvd = ">= 0.1.1" xformers = ">= 0.0.26" -pyav = ">= 13.0.0" +av = ">= 13.0.0" clip = ">= 0.2.0" image_reward = { git = "https://github.com/THUDM/ImageReward.git", branch = "main" } diff --git a/z_image_scripts/z_image_key_map.py b/z_image_scripts/z_image_key_map.py new file mode 100644 index 0000000..40f0c86 --- /dev/null +++ b/z_image_scripts/z_image_key_map.py @@ -0,0 +1,259 @@ +'''printed from ZImageStruct._get_default_key_map()''' +_map = { + 'transformer_norm': { + 'lk_transformer_norm', + 'nrk_transformer_norm', + 'crk_transformer_norm' + }, + 'nrk_transformer_norm': { + 'nrk_transformer_norm' + }, + 'nrk': { + 'nrk_attn_add_qkv_proj', + 'nrk_ffn_add_up_proj', + 'nrk_ffn_up_proj', + 'nrk_ffn_add_down_proj', + 'nrk_attn_out_proj', + 'nrk_transformer_norm', + 'nrk_ffn_down_proj', + 'nrk_transformer_add_norm', + 'nrk_attn_add_out_proj', + 'nrk_attn_qkv_proj' + }, + 'transformer_add_norm': { + 'nrk_transformer_add_norm', + 'lk_transformer_add_norm', + 'crk_transformer_add_norm' + }, + 'nrk_transformer_add_norm': { + 'nrk_transformer_add_norm' + }, + 'attn': { + 'crk_attn_out_proj', + 'nrk_attn_out_proj', + 'lk_attn_out_proj', + 'lk_attn_qkv_proj', + 'crk_attn_qkv_proj', + 'nrk_attn_qkv_proj' + }, + 'nrk_attn': { + 'nrk_attn_qkv_proj', + 'nrk_attn_out_proj' + }, + 'attn_add': { + 'nrk_attn_add_qkv_proj', + 'lk_attn_add_qkv_proj', + 'crk_attn_add_out_proj', + 'crk_attn_add_qkv_proj', + 'nrk_attn_add_out_proj', + 'lk_attn_add_out_proj' + }, + 'nrk_attn_add': { + 'nrk_attn_add_qkv_proj', + 'nrk_attn_add_out_proj' + }, + 'attn_qkv_proj': { + 'lk_attn_qkv_proj', + 'crk_attn_qkv_proj', + 'nrk_attn_qkv_proj' + }, + 'nrk_attn_qkv_proj': { + 'nrk_attn_qkv_proj' + }, + 'attn_out_proj': { + 'lk_attn_out_proj', + 'crk_attn_out_proj', + 'nrk_attn_out_proj' + }, + 'nrk_attn_out_proj': { + 'nrk_attn_out_proj' + }, + 'attn_add_qkv_proj': { + 'nrk_attn_add_qkv_proj', + 'lk_attn_add_qkv_proj', + 'crk_attn_add_qkv_proj' + }, + 'nrk_attn_add_qkv_proj': { + 'nrk_attn_add_qkv_proj' + }, + 'attn_add_out_proj': { + 'lk_attn_add_out_proj', + 'crk_attn_add_out_proj', + 'nrk_attn_add_out_proj' + }, + 'nrk_attn_add_out_proj': { + 'nrk_attn_add_out_proj' + }, + 'ffn': { + 'crk_ffn_up_proj', + 'nrk_ffn_up_proj', + 'lk_ffn_up_proj', + 'nrk_ffn_down_proj', + 'crk_ffn_down_proj', + 'lk_ffn_down_proj' + }, + 'nrk_ffn': { + 'nrk_ffn_down_proj', + 'nrk_ffn_up_proj' + }, + 'ffn_add': { + 'nrk_ffn_add_up_proj', + 'crk_ffn_add_down_proj', + 'lk_ffn_add_up_proj', + 'nrk_ffn_add_down_proj', + 'crk_ffn_add_up_proj', + 'lk_ffn_add_down_proj' + }, + 'nrk_ffn_add': { + 
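+ # (rkey prefix legend, per ZImageStruct: nrk = noise_refiner blocks, crk = context_refiner blocks, lk = layers)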
'nrk_ffn_add_up_proj', + 'nrk_ffn_add_down_proj' + }, + 'ffn_up_proj': { + 'nrk_ffn_up_proj', + 'lk_ffn_up_proj', + 'crk_ffn_up_proj' + }, + 'nrk_ffn_up_proj': { + 'nrk_ffn_up_proj' + }, + 'ffn_down_proj': { + 'nrk_ffn_down_proj', + 'crk_ffn_down_proj', + 'lk_ffn_down_proj' + }, + 'nrk_ffn_down_proj': { + 'nrk_ffn_down_proj' + }, + 'ffn_add_up_proj': { + 'nrk_ffn_add_up_proj', + 'crk_ffn_add_up_proj', + 'lk_ffn_add_up_proj' + }, + 'nrk_ffn_add_up_proj': { + 'nrk_ffn_add_up_proj' + }, + 'ffn_add_down_proj': { + 'lk_ffn_add_down_proj', + 'nrk_ffn_add_down_proj', + 'crk_ffn_add_down_proj' + }, + 'nrk_ffn_add_down_proj': { + 'nrk_ffn_add_down_proj' + }, + 'crk_transformer_norm': { + 'crk_transformer_norm' + }, + 'crk': { + 'crk_ffn_up_proj', + 'crk_attn_out_proj', + 'crk_ffn_add_down_proj', + 'crk_attn_add_out_proj', + 'crk_transformer_add_norm', + 'crk_attn_add_qkv_proj', + 'crk_ffn_add_up_proj', + 'crk_ffn_down_proj', + 'crk_transformer_norm', + 'crk_attn_qkv_proj' + }, + 'crk_transformer_add_norm': { + 'crk_transformer_add_norm' + }, + 'crk_attn': { + 'crk_attn_qkv_proj', + 'crk_attn_out_proj' + }, + 'crk_attn_add': { + 'crk_attn_add_out_proj', + 'crk_attn_add_qkv_proj' + }, + 'crk_attn_qkv_proj': { + 'crk_attn_qkv_proj' + }, + 'crk_attn_out_proj': { + 'crk_attn_out_proj' + }, + 'crk_attn_add_qkv_proj': { + 'crk_attn_add_qkv_proj' + }, + 'crk_attn_add_out_proj': { + 'crk_attn_add_out_proj' + }, + 'crk_ffn': { + 'crk_ffn_up_proj', + 'crk_ffn_down_proj' + }, + 'crk_ffn_add': { + 'crk_ffn_add_up_proj', + 'crk_ffn_add_down_proj' + }, + 'crk_ffn_up_proj': { + 'crk_ffn_up_proj' + }, + 'crk_ffn_down_proj': { + 'crk_ffn_down_proj' + }, + 'crk_ffn_add_up_proj': { + 'crk_ffn_add_up_proj' + }, + 'crk_ffn_add_down_proj': { + 'crk_ffn_add_down_proj' + }, + 'lk_transformer_norm': { + 'lk_transformer_norm' + }, + 'lk': { + 'lk_ffn_add_up_proj', + 'lk_transformer_add_norm', + 'lk_attn_add_qkv_proj', + 'lk_transformer_norm', + 'lk_ffn_up_proj', + 'lk_attn_out_proj', + 'lk_attn_qkv_proj', + 'lk_ffn_add_down_proj', + 'lk_ffn_down_proj', + 'lk_attn_add_out_proj' + }, + 'lk_transformer_add_norm': { + 'lk_transformer_add_norm' + }, + 'lk_attn': { + 'lk_attn_out_proj', + 'lk_attn_qkv_proj' + }, + 'lk_attn_add': { + 'lk_attn_add_qkv_proj', + 'lk_attn_add_out_proj' + }, + 'lk_attn_qkv_proj': { + 'lk_attn_qkv_proj' + }, + 'lk_attn_out_proj': { + 'lk_attn_out_proj' + }, + 'lk_attn_add_qkv_proj': { + 'lk_attn_add_qkv_proj' + }, + 'lk_attn_add_out_proj': { + 'lk_attn_add_out_proj' + }, + 'lk_ffn': { + 'lk_ffn_down_proj', + 'lk_ffn_up_proj' + }, + 'lk_ffn_add': { + 'lk_ffn_add_down_proj', + 'lk_ffn_add_up_proj' + }, + 'lk_ffn_up_proj': { + 'lk_ffn_up_proj' + }, + 'lk_ffn_down_proj': { + 'lk_ffn_down_proj' + }, + 'lk_ffn_add_up_proj': { + 'lk_ffn_add_up_proj' + }, + 'lk_ffn_add_down_proj': { + 'lk_ffn_add_down_proj' + } +} \ No newline at end of file diff --git a/z_image_scripts/z_image_key_map_test.sh b/z_image_scripts/z_image_key_map_test.sh new file mode 100755 index 0000000..246e084 --- /dev/null +++ b/z_image_scripts/z_image_key_map_test.sh @@ -0,0 +1 @@ +TORCH_CUDA_ARCH_LIST="8.9" python3 -c "import runpy; runpy.run_path('deepcompressor/app/diffusion/nn/struct.py', run_name='__main__')" \ No newline at end of file diff --git a/z_image_scripts/z_image_turbo_calib.sh b/z_image_scripts/z_image_turbo_calib.sh new file mode 100755 index 0000000..cd067cc --- /dev/null +++ b/z_image_scripts/z_image_turbo_calib.sh @@ -0,0 +1,6 @@ +TORCH_CUDA_ARCH_LIST="9.0" python3 -m deepcompressor.app.diffusion.dataset.collect.calib \ + 
examples/diffusion/configs/model/z-image-turbo.yaml examples/diffusion/configs/collect/qdiff.yaml + +echo $? + +# nohup z_image_scripts/z_image_turbo_calib.sh > z_image_scripts/z_image_turbo_calib_20251203_0019.log 2>&1 & \ No newline at end of file diff --git a/z_image_scripts/z_image_turbo_convert.sh b/z_image_scripts/z_image_turbo_convert.sh new file mode 100755 index 0000000..692acbd --- /dev/null +++ b/z_image_scripts/z_image_turbo_convert.sh @@ -0,0 +1,11 @@ +echo "--quant-path /data/dongd/dc_saved_model/Z_IMAGE_TURBO_20251204_0743" + +TORCH_CUDA_ARCH_LIST="9.0" python -m deepcompressor.backend.nunchaku.convert \ + --quant-path /data/dongd/dc_saved_model/Z_IMAGE_TURBO_20251204_0743 \ + --output-root /data/dongd/dc_converted_model/Z_IMAGE_TURBO_20251204_0743_r128 \ + --model-name z-image-turbo + + +echo $? + +# nohup z_image_scripts/z_image_turbo_convert.sh > z_image_scripts/z_image_turbo_convert_20251205_2244.log 2>&1 & \ No newline at end of file diff --git a/z_image_scripts/z_image_turbo_gen_image_after_quant.sh b/z_image_scripts/z_image_turbo_gen_image_after_quant.sh new file mode 100755 index 0000000..a160ad1 --- /dev/null +++ b/z_image_scripts/z_image_turbo_gen_image_after_quant.sh @@ -0,0 +1,9 @@ +echo "--load-from /data/dongd/dc_saved_model/Z_IMAGE_TURBO_20251203_0031" + +TORCH_CUDA_ARCH_LIST="9.0" python3 -m deepcompressor.app.diffusion.ptq \ + examples/diffusion/configs/model/z-image-turbo.yaml examples/diffusion/configs/svdquant/int4.yaml \ + --load-from /data/dongd/dc_saved_model/Z_IMAGE_TURBO_20251203_0031 --skip-eval true + +echo $? + +# nohup z_image_scripts/z_image_turbo_gen_image_after_quant.sh > z_image_scripts/z_image_turbo_gen_image_after_quant_20251203_1136.log 2>&1 & \ No newline at end of file diff --git a/z_image_scripts/z_image_turbo_quantize.sh b/z_image_scripts/z_image_turbo_quantize.sh new file mode 100755 index 0000000..7c6ef1f --- /dev/null +++ b/z_image_scripts/z_image_turbo_quantize.sh @@ -0,0 +1,7 @@ +TORCH_CUDA_ARCH_LIST="9.0" python3 -m deepcompressor.app.diffusion.ptq \ + examples/diffusion/configs/model/z-image-turbo-rank128-skip-refiners.yaml examples/diffusion/configs/svdquant/int4.yaml \ + --save-model /data/dongd/dc_saved_model/Z_IMAGE_TURBO_20251204_1646 --copy-on-save true --skip-eval true + +echo $? + +# nohup z_image_scripts/z_image_turbo_quantize.sh > z_image_scripts/z_image_turbo_quantize_20251204_1646.log 2>&1 & \ No newline at end of file
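# Editor's note: a minimal sketch (not part of the patch) for spot-checking the output of
# deepcompressor.backend.nunchaku.convert. The checkpoint directory below reuses the
# --output-root/--model-name pair from z_image_scripts/z_image_turbo_convert.sh above and
# should be replaced with your own path.

# Sketch: inspect the two safetensors files written by the convert step.
import safetensors.torch

ckpt_dir = "/data/dongd/dc_converted_model/Z_IMAGE_TURBO_20251204_0743_r128/z-image-turbo"
blocks = safetensors.torch.load_file(f"{ckpt_dir}/transformer_blocks.safetensors")
others = safetensors.torch.load_file(f"{ckpt_dir}/unquantized_layers.safetensors")

# After _replace_lora_and_smooth_key, quantized blocks should carry
# .proj_down/.proj_up (low-rank branch) and .smooth_factor(_orig) keys.
for key in sorted(blocks):
    print(key, tuple(blocks[key].shape), blocks[key].dtype)
print(f"{len(blocks)} tensors in transformer_blocks, {len(others)} in unquantized_layers")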