Commit 0677428

nfp4
Signed-off-by: Kyle Sayers <[email protected]>
1 parent 1c85a66 commit 0677428

File tree

6 files changed: +195 −30 lines

src/llmcompressor/entrypoints/model_free/__init__.py
src/llmcompressor/entrypoints/model_free/helpers.py
src/llmcompressor/entrypoints/model_free/lifecycle.py
src/llmcompressor/entrypoints/model_free/microscale.py
src/llmcompressor/entrypoints/model_free/model_utils.py
tests/llmcompressor/pipelines/test_model_free_ptq.py

src/llmcompressor/entrypoints/model_free/__init__.py

Lines changed: 98 additions & 5 deletions
@@ -1,5 +1,6 @@
 import os
 import shutil
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from pathlib import Path
 from typing import Optional
@@ -13,13 +14,19 @@
 
 from llmcompressor.entrypoints.model_free.helpers import (
     gpu_if_available,
+    validate_safetensors_index,
     validate_scheme,
 )
 from llmcompressor.entrypoints.model_free.lifecycle import (
-    calibrate_weights,
+    calibrate_global_scale,
+    calibrate_scale_zp,
     compress_module,
     initialize_quantized_linear,
 )
+from llmcompressor.entrypoints.model_free.microscale import (
+    get_fused_names,
+    is_microscale_scheme,
+)
 from llmcompressor.entrypoints.model_free.model_utils import (
     get_checkpoint_files,
     is_weights_file,
@@ -55,16 +62,20 @@ def model_free_ptq(
     model_files = get_checkpoint_files(model_stub)
     scheme_name, scheme = validate_scheme(scheme)
     device = gpu_if_available(device)
+    validate_safetensors_index(model_files, scheme)
 
     # 0. collect safetensors files, copy files
     jobs = []
+    job_fn = (
+        _process_file
+        if not is_microscale_scheme(scheme)
+        else _process_file_microscale_scheme
+    )
     for file_path, resolved_path in model_files:
         save_path = Path(save_directory) / file_path
 
         if file_path.endswith("safetensors"):
-            jobs.append(
-                (_process_file, resolved_path, save_path, scheme, ignore, device)
-            )
+            jobs.append((job_fn, resolved_path, save_path, scheme, ignore, device))
 
         else:
             if is_weights_file(file_path):
@@ -108,6 +119,7 @@ def _process_file(
         ignored
     :param device: device used to quantize and compress weights
     """
+    assert not is_microscale_scheme(scheme), "Use `_process_file_microscale_scheme`"
     tensors = load_file(file_path)
 
     for name in list(tensors.keys()):
@@ -121,7 +133,66 @@ def _process_file(
         module = initialize_quantized_linear(tensors[name], scheme, device)
 
         # 2. calibrate weight qparams
-        calibrate_weights(module)
+        calibrate_scale_zp(module)
+
+        # 3. compress module using qparams
+        compress_module(module)
+
+        # 4. save compressed data (on cpu)
+        del tensors[name]
+        prefix = module_name + "."
+        for key, value in module.state_dict(prefix=prefix).items():
+            tensors[key] = value.to("cpu")
+
+    save_file(tensors, save_path)
+    total_size = sum(tensor.nbytes for tensor in tensors.values())
+    weight_map = {key: os.path.basename(save_path) for key in tensors.keys()}
+    return total_size, weight_map
+
+
+def _process_file_microscale_scheme(
+    file_path: str | os.PathLike,
+    save_path: str | os.PathLike,
+    scheme: QuantizationScheme,
+    ignore: str | list[str],
+    device: str | torch.device,
+) -> tuple[int, dict[str, str]]:
+    """
+    Quantize and compress tensors in a given safetensors file
+
+    :param file_path: safetensors file to process
+    :param save_path: save path of file with quantized weights
+    :param scheme: quantization scheme to apply to tensors
+    :param ignore: modules to ignore. Modules ending with "norm" are automatically
+        ignored
+    :param device: device used to quantize and compress weights
+    """
+    assert is_microscale_scheme(scheme), "Use `_process_file` for non-microscale scheme"
+    tensors = load_file(file_path)
+    fused_names = get_fused_names(tensors)
+    fused_names_to_parent = {
+        name: prefix for prefix, names in fused_names.items() for name in names
+    }
+    fused_parent_submodules = defaultdict(dict)
+
+    for name in list(tensors.keys()):
+        module_name, param_name = name.rsplit(".", 1)
+        is_linear_weight = param_name == "weight" and not module_name.endswith("norm")
+        is_ignored = any(_match_name(module_name, ign) for ign in ignore)
+        if not is_linear_weight or is_ignored:
+            continue
+
+        # 1. initialize module with qparams (on device)
+        module = initialize_quantized_linear(tensors[name], scheme, device)
+
+        # 2. calibrate weight qparams. Delay scale/zp calibration for fused modules
+        calibrate_global_scale(module)
+        if name in fused_names_to_parent:
+            fused_parent = fused_names_to_parent[name]
+            fused_parent_submodules[fused_parent][name] = module
+            continue
+
+        calibrate_scale_zp(module)
 
         # 3. compress module using qparams
         compress_module(module)
@@ -132,6 +203,28 @@ def _process_file(
         for key, value in module.state_dict(prefix=prefix).items():
             tensors[key] = value.to("cpu")
 
+    # compress and save microscale fused modules
+    for parent_name, named_modules in fused_parent_submodules.items():
+        # 2.1. fuse global scales
+        global_scales = [m.weight_global_scale for m in named_modules.values()]
+        fused_global_scale = torch.min(torch.cat(global_scales, dim=0))
+
+        for name, module in named_modules.items():
+            module_name, param_name = name.rsplit(".", 1)
+            module.weight_global_scale.data.copy_(fused_global_scale)
+
+            # 2.2. finish calibration with fused global scales
+            calibrate_scale_zp(module)
+
+            # 3. compress module using qparams
+            compress_module(module)
+
+            # 4. save compressed data (on cpu)
+            del tensors[name]
+            prefix = module_name + "."
+            for key, value in module.state_dict(prefix=prefix).items():
+                tensors[key] = value.to("cpu")
+
     save_file(tensors, save_path)
     total_size = sum(tensor.nbytes for tensor in tensors.values())
     weight_map = {key: os.path.basename(save_path) for key in tensors.keys()}
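
Note on the fused-module handling added above: under a microscale scheme, the q/k/v and gate/up siblings identified by get_fused_names end up sharing one weight_global_scale, which _process_file_microscale_scheme computes as the minimum over the siblings' individual global scales before finishing their scale/zero-point calibration. A minimal sketch of that reduction, using plain tensors as stand-in global scales (names and values here are illustrative, not from the commit):

import torch

global_scales = [
    torch.tensor([448.0]),  # e.g. q_proj weight_global_scale
    torch.tensor([312.0]),  # e.g. k_proj weight_global_scale
    torch.tensor([501.0]),  # e.g. v_proj weight_global_scale
]

# the fused group adopts the smallest of the individual global scales
fused_global_scale = torch.min(torch.cat(global_scales, dim=0))
print(fused_global_scale)  # tensor(312.)

for scale in global_scales:
    scale.data.copy_(fused_global_scale)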

src/llmcompressor/entrypoints/model_free/helpers.py

Lines changed: 36 additions & 16 deletions
@@ -1,12 +1,16 @@
-from typing import Optional
+import json
 
 import torch
-from compressed_tensors.quantization import QuantizationScheme, preset_name_to_scheme
+from compressed_tensors.quantization import (
+    QuantizationScheme,
+    preset_name_to_scheme,
+)
 from compressed_tensors.utils import getattr_chain
-from compressed_tensors.utils.match import _match_name
 from loguru import logger
 
-__all__ = ["validate_scheme", "gpu_if_available", "is_match_name"]
+from .microscale import get_fused_names, is_microscale_scheme
+
+__all__ = ["validate_scheme", "gpu_if_available"]
 
 
 def validate_scheme(scheme: QuantizationScheme) -> tuple[str, QuantizationScheme]:
@@ -48,6 +52,34 @@ def validate_scheme(scheme: QuantizationScheme) -> tuple[str, QuantizationScheme]:
     return scheme_name, scheme
 
 
+def validate_safetensors_index(
+    model_files: list[tuple[str, str]], scheme: QuantizationScheme
+):
+    resolved_paths = [
+        resolved_path
+        for file_path, resolved_path in model_files
+        if file_path.endswith("safetensors.index.json")
+    ]
+    if len(resolved_paths) <= 0:
+        return
+    resolved_path = resolved_paths[0]
+
+    if is_microscale_scheme(scheme):
+        with open(resolved_path, "r") as file:
+            weight_map: dict[str, str] = json.load(file)["weight_map"]
+
+        fused_names = get_fused_names(weight_map)
+        for submodule_names in fused_names.values():
+            file_names = [weight_map[name] for name in submodule_names]
+            if not all(file_name == file_names[0] for file_name in file_names):
+                raise NotImplementedError(
+                    "When using a microscale scheme (NVFP4, MXFP4), global scales "
+                    "will be fused. Current implementation requires that all fused "
+                    "modules (attention and non-moe mlp) be stored in the same file. "
+                    f"Instead, got {submodule_names}\n\n {file_names}"
+                )
+
+
 def gpu_if_available(device: torch.device | str | None) -> torch.device:
     if device is not None:
         return torch.device(device)
@@ -61,15 +93,3 @@ def gpu_if_available(device: torch.device | str | None) -> torch.device:
     else:
         logger.warning("CUDA/XPU is not available! Compressing model on CPU instead")
         return torch.device("cpu")
-
-
-def is_match_name(
-    name: str, targets: list[str], ignore: Optional[str | list[str]] = None
-) -> bool:
-    targets = targets if isinstance(targets, list) else [targets]
-    ignore = ignore if isinstance(ignore, list) else [ignore]
-
-    matches_target = any(_match_name(name, target) for target in targets)
-    matches_ignore = any(_match_name(name, ign) for ign in ignore)
-
-    return matches_target and not matches_ignore
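
Note: validate_safetensors_index only rejects a checkpoint when a microscale scheme is requested and a fused group is split across shards. A sketch of the condition it enforces, using a hypothetical two-shard weight_map (module and shard names are illustrative):

from llmcompressor.entrypoints.model_free.microscale import get_fused_names

weight_map = {
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
}

for submodule_names in get_fused_names(weight_map).values():
    file_names = [weight_map[name] for name in submodule_names]
    if len(set(file_names)) > 1:
        # q/k/v global scales must be fused, but the tensors live in different
        # shards, so validate_safetensors_index raises NotImplementedError
        print("unsupported:", submodule_names)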

src/llmcompressor/entrypoints/model_free/lifecycle.py

Lines changed: 9 additions & 7 deletions
@@ -3,7 +3,6 @@
 from compressed_tensors.config.format import _get_quant_compression_format
 from compressed_tensors.quantization import (
     QuantizationScheme,
-    QuantizationStrategy,
     initialize_module_for_quantization,
 )
 
@@ -17,7 +16,8 @@
 
 __all__ = [
     "initialize_quantized_linear",
-    "calibrate_weights",
+    "calibrate_global_scale",
+    "calibrate_scale_zp",
     "compress_module",
 ]
 
@@ -35,15 +35,17 @@ def initialize_quantized_linear(
     return module
 
 
-def calibrate_weights(module: torch.nn.Linear):
-    scheme: QuantizationScheme = getattr(module, "quantization_scheme")
+def calibrate_global_scale(module: torch.nn.Linear):
     initialize_observer(module, "weight")
+    apply_calibration_status(module)
+    update_weight_global_scale(module)
+    freeze_module_quantization(module)
 
+
+def calibrate_scale_zp(module: torch.nn.Linear):
+    initialize_observer(module, "weight")
     apply_calibration_status(module)
-    if scheme.weights.strategy == QuantizationStrategy.TENSOR_GROUP:
-        update_weight_global_scale(module)
     update_weight_zp_scale(module)
-
     freeze_module_quantization(module)
 
 
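Note: the refactor above splits the old calibrate_weights entry point into calibrate_global_scale and calibrate_scale_zp, so the microscale path can pause between the two steps while sibling global scales are fused. A rough, untested sketch of the per-module call order; the weight shape, the CPU device, and construction of the scheme via compressed_tensors' preset_name_to_scheme are assumptions for illustration, not part of the commit:

import torch
from compressed_tensors.quantization import preset_name_to_scheme

from llmcompressor.entrypoints.model_free.lifecycle import (
    calibrate_global_scale,
    calibrate_scale_zp,
    compress_module,
    initialize_quantized_linear,
)

# hypothetical inputs: one linear weight and a weight-only NVFP4 scheme
weight = torch.randn(256, 256)
scheme = preset_name_to_scheme("NVFP4A16", targets=["Linear"])

module = initialize_quantized_linear(weight, scheme, "cpu")
calibrate_global_scale(module)  # observer -> weight_global_scale, then freeze
# (for fused q/k/v or gate/up siblings, global scales are min-reduced here)
calibrate_scale_zp(module)      # observer -> per-group scale / zero-point, then freeze
compress_module(module)         # pack the weight using the calibrated qparams
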
src/llmcompressor/entrypoints/model_free/microscale.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+import torch
+from compressed_tensors.quantization import QuantizationScheme, QuantizationStrategy
+
+__all__ = ["get_fused_names", "is_microscale_scheme"]
+
+
+def is_microscale_scheme(scheme: QuantizationScheme) -> bool:
+    assert scheme.weights is not None
+    return scheme.weights.strategy == QuantizationStrategy.TENSOR_GROUP
+
+
+def get_fused_names(tensors: dict[str, torch.Tensor]) -> dict[str, list[str]]:
+    fused_names = {}
+
+    for name in tensors:
+        parts = name.rsplit(".")
+        if len(parts) < 3:
+            continue
+
+        parent, module, param = parts[-3:]
+
+        if (
+            ("attn" in parent or "attention" in parent)
+            and module == "q_proj"
+            and param == "weight"
+        ):
+            parent_name = ".".join((*parts[:-3], parent))
+            q_name = ".".join((parent_name, "q_proj", param))
+            k_name = ".".join((parent_name, "k_proj", param))
+            v_name = ".".join((parent_name, "v_proj", param))
+
+            submodule_names = [q_name, k_name, v_name]
+
+            if all(name in tensors for name in submodule_names):
+                assert parent_name not in fused_names
+                fused_names[parent_name] = submodule_names
+
+        if "mlp" in parent and module == "gate_proj" and param == "weight":
+            parent_name = ".".join((*parts[:-3], parent))
+            gate_name = ".".join((parent_name, "gate_proj", param))
+            up_name = ".".join((parent_name, "up_proj", param))
+
+            submodule_names = [gate_name, up_name]
+
+            if all(name in tensors for name in submodule_names):
+                assert parent_name not in fused_names
+                fused_names[parent_name] = submodule_names
+
+    return fused_names
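
Note: get_fused_names groups q/k/v weights under an attention parent and gate/up weights under an mlp parent, keyed by the parent module, and only reports a group when every sibling is present. A small illustration using Llama-style module names (the names are illustrative; the function only inspects the dict keys, so placeholder values suffice):

from llmcompressor.entrypoints.model_free.microscale import get_fused_names

names = {
    "model.layers.0.self_attn.q_proj.weight": None,
    "model.layers.0.self_attn.k_proj.weight": None,
    "model.layers.0.self_attn.v_proj.weight": None,
    "model.layers.0.mlp.gate_proj.weight": None,
    "model.layers.0.mlp.up_proj.weight": None,
    "model.layers.0.mlp.down_proj.weight": None,   # not part of any fused group
    "model.layers.0.input_layernorm.weight": None,
}

print(get_fused_names(names))
# {
#   "model.layers.0.self_attn": [
#       "model.layers.0.self_attn.q_proj.weight",
#       "model.layers.0.self_attn.k_proj.weight",
#       "model.layers.0.self_attn.v_proj.weight",
#   ],
#   "model.layers.0.mlp": [
#       "model.layers.0.mlp.gate_proj.weight",
#       "model.layers.0.mlp.up_proj.weight",
#   ],
# }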

src/llmcompressor/entrypoints/model_free/model_utils.py

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@ def is_weights_file(file_name: str) -> bool:
     return any(file_name.endswith(suffix) for suffix in weights_files)
 
 
-def get_checkpoint_files(model_stub: str | os.PathLike) -> list[str]:
+def get_checkpoint_files(model_stub: str | os.PathLike) -> list[tuple[str, str]]:
     # In the future, this function can accept and pass download kwargs to cached_file
 
     if os.path.exists(model_stub):
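
Note: the updated annotation matches how callers already unpack the result, pairing each checkpoint-relative file name with its resolved local path (see the "for file_path, resolved_path in model_files" loops in model_free_ptq and validate_safetensors_index). A hypothetical example of the returned shape (paths are illustrative):

model_files = [
    ("config.json", "/cache/snapshots/abc123/config.json"),
    ("model.safetensors", "/cache/snapshots/abc123/model.safetensors"),
]
for file_path, resolved_path in model_files:
    print(file_path, "->", resolved_path)  # relative name -> local path on disk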

tests/llmcompressor/pipelines/test_model_free_ptq.py

Lines changed: 2 additions & 1 deletion
@@ -41,7 +41,8 @@ def _get_tiny_block_quant():
 
 @requires_gpu
 @pytest.mark.parametrize(
-    "scheme", [_get_tiny_w4a16_quant(), "FP8_dynamic", _get_tiny_block_quant()]
+    "scheme",
+    [_get_tiny_w4a16_quant(), "FP8_dynamic", _get_tiny_block_quant(), "NVFP4A16"],
 )
 def test_model_free_ptq_matches_oneshot(scheme, tmp_path):
     model = "nm-testing/tinysmokellama-3.2"
