
Commit 8db6c6b

nvfp4a16

Signed-off-by: Kyle Sayers <[email protected]>

1 parent 9e1a040 commit 8db6c6b

File tree

14 files changed: +626 −143 lines changed

examples/model_free_ptq/README.md

Lines changed: 50 additions & 0 deletions
@@ -13,3 +13,53 @@

In `kimi_k2_thinking_fp8_block.py`, we call `model_free_ptq` by providing a `scheme` and an `ignore` list, similar to how we provide recipes to `oneshot` calls. In the case of Kimi-K2 Thinking, we apply the `FP8_BLOCK` scheme and ignore layers that are incompatible with a block size of 128 (specifically, `kv_a_proj_with_mqa` and `q_a_proj`).

In contrast to `oneshot`, the model stub or path string is passed directly to `model_free_ptq`, rather than first being loaded through transformers. Once complete, the model is compressed using compressed-tensors and saved to `SAVE_DIR`.
To get started, simply call `model_free_ptq` with your desired model stub and save directory:
```python
model_free_ptq(
    model_stub="unsloth/Kimi-K2-Thinking-BF16",
    save_directory="Kimi-K2-Thinking-FP8-BLOCK",
    scheme="FP8_BLOCK",
    ignore=[
        "re:.*gate$",
        "lm_head",
        "re:.*kv_a_proj_with_mqa$",
        "re:.*q_a_proj$",
        "model.embed_tokens",
    ],
    max_workers=15,
    device="cuda:0",
)
```
# Quantizing models to NVFP4A16 / MXFP4A16
Using `model_free_ptq` to quantize models with microscale schemes (NVFP4/MXFP4) is the same as quantizing with non-microscale schemes, except for one additional step: the safetensors files must be reindexed to guarantee that fused modules (qkv, gate_up) end up in the same safetensors file, which helps `model_free_ptq` fuse global scales.

First, apply `llmcompressor.reindex_fused_weights` from the command-line entrypoint:
```bash
llmcompressor.reindex_fused_weights \
    unsloth/Kimi-K2-Thinking-BF16 \
    Kimi-K2-Thinking-BF16-reindexed \
    --num_workers=10
```
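Reindexing enforces a simple invariant that can be checked straight from the model's safetensors index. Below is a minimal sketch of that check (not the shipped implementation; the index filename and the q/k/v module naming are illustrative assumptions):

```python
# Sketch only: verifies the invariant that reindexing establishes, i.e. that all
# members of a fused group (here q/k/v projections) live in the same shard.
import json
from collections import defaultdict

with open("model.safetensors.index.json") as f:  # hypothetical local index copy
    weight_map: dict[str, str] = json.load(f)["weight_map"]

shards_per_module: dict[str, set[str]] = defaultdict(set)
for tensor_name, shard in weight_map.items():
    for suffix in ("q_proj", "k_proj", "v_proj"):
        marker = f".{suffix}."
        if marker in tensor_name:
            # group by the parent module that owns the fused projections
            shards_per_module[tensor_name.split(marker)[0]].add(shard)

for module, shards in sorted(shards_per_module.items()):
    if len(shards) > 1:
        print(f"{module}: fused weights split across {sorted(shards)}, reindex needed")
```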
Then, call `model_free_ptq` on the reindexed files:
```python
model_free_ptq(
    model_stub="Kimi-K2-Thinking-BF16-reindexed",
    save_directory="Kimi-K2-Thinking-BF16-NVFP4A16",
    scheme="NVFP4A16",
    ignore=[
        "re:.*gate$",
        "lm_head",
        "re:.*kv_a_proj_with_mqa$",
        "re:.*q_a_proj$",
        "model.embed_tokens",
    ],
    max_workers=15,
    device="cuda:0",
)
```

examples/model_free_ptq/kimi_k2_thinking_fp8_block.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 from llmcompressor import model_free_ptq

 MODEL_ID = "unsloth/Kimi-K2-Thinking-BF16"
-SAVE_DIR = "Kimi-K2-Thinking-FP8-Block"
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-BLOCK"

 # Apply FP8-Block to the model
 # Once quantized, the model is saved
```
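For reference, the new `SAVE_DIR` expression simply derives the output directory from the basename of the model stub; evaluating it by hand:

```python
# Quick check of what the new SAVE_DIR expression evaluates to
MODEL_ID = "unsloth/Kimi-K2-Thinking-BF16"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-FP8-BLOCK"
assert SAVE_DIR == "Kimi-K2-Thinking-BF16-FP8-BLOCK"
```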
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
```python
"""
NOTE: Please run the following command before using `model_free_ptq`

The command below reindexes the safetensors files of a model such that all fused
modules (gate_up, qkv) are in the same safetensors file. This is required by
model_free_ptq for microscale schemes (NVFP4A16, MXFP4A16)

llmcompressor.reindex_fused_weights \
    unsloth/Kimi-K2-Thinking-BF16 \
    Kimi-K2-Thinking-BF16-reindexed \
    --num_workers=10
"""

from llmcompressor import model_free_ptq

MODEL_ID = "unsloth/Kimi-K2-Thinking-BF16"
REINDEX_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-reindexed"
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4A16"

# See above notice pertaining to safetensors reindexing
# After running `llmcompressor.reindex_fused_weights`,
# use `model_free_ptq` to apply NVFP4A16 quantization
model_free_ptq(
    model_stub=REINDEX_DIR,
    save_directory=SAVE_DIR,
    scheme="NVFP4A16",
    ignore=[
        "re:.*gate$",
        "lm_head",
        "re:.*kv_a_proj_with_mqa$",
        "re:.*q_a_proj$",
        "model.embed_tokens",
    ],
    max_workers=15,
    device="cuda:0",
)
```

setup.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -184,6 +184,7 @@ def localversion_func(version: ScmVersion) -> str:
     entry_points={
         "console_scripts": [
             "llmcompressor.trace=llmcompressor.transformers.tracing.debug:main",
+            "llmcompressor.reindex_fused_weights=llmcompressor.entrypoints.model_free.reindex_fused_weights:main",
         ]
     },
     python_requires=">=3.10",
```
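After installation, the new console script dispatches to the module's `main` function. The sketch below is equivalent, assuming `main` reads its arguments from `sys.argv` like a typical argparse CLI (an assumption; only the module path and `main` symbol are guaranteed by the entry point):

```python
# Equivalent to running the `llmcompressor.reindex_fused_weights` console script.
# Assumes `main` parses sys.argv; the argv layout below mirrors the README example.
import sys

from llmcompressor.entrypoints.model_free.reindex_fused_weights import main

sys.argv = [
    "llmcompressor.reindex_fused_weights",
    "unsloth/Kimi-K2-Thinking-BF16",
    "Kimi-K2-Thinking-BF16-reindexed",
    "--num_workers=10",
]
main()
```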

src/llmcompressor/entrypoints/model_free/__init__.py

Lines changed: 20 additions & 62 deletions
```diff
@@ -7,27 +7,28 @@
 import torch
 import tqdm
 from compressed_tensors.quantization import QuantizationScheme
-from compressed_tensors.utils.match import _match_name
 from loguru import logger
-from safetensors.torch import load_file, save_file

-from llmcompressor.entrypoints.model_free.helpers import (
-    gpu_if_available,
-    validate_scheme,
-)
-from llmcompressor.entrypoints.model_free.lifecycle import (
-    calibrate_weights,
-    compress_module,
-    initialize_quantized_linear,
+from llmcompressor.entrypoints.model_free.helpers import gpu_if_available
+from llmcompressor.entrypoints.model_free.microscale import (
+    is_microscale_scheme,
 )
 from llmcompressor.entrypoints.model_free.model_utils import (
     get_checkpoint_files,
     is_weights_file,
 )
+from llmcompressor.entrypoints.model_free.process import (
+    process_file,
+    process_file_microscale_scheme,
+)
 from llmcompressor.entrypoints.model_free.save_utils import (
     update_config,
     update_safetensors_index,
 )
+from llmcompressor.entrypoints.model_free.validate import (
+    validate_safetensors_index,
+    validate_scheme,
+)

 __all__ = ["model_free_ptq"]

@@ -55,20 +56,24 @@ def model_free_ptq(
     model_files = get_checkpoint_files(model_stub)
     scheme_name, scheme = validate_scheme(scheme)
     device = gpu_if_available(device)
+    validate_safetensors_index(model_files, scheme)

     # 0. collect safetensors files, copy files
     jobs = []
-    for file_path, resolved_path in model_files:
+    job_fn = (
+        process_file
+        if not is_microscale_scheme(scheme)
+        else process_file_microscale_scheme
+    )
+    for file_path, resolved_path in model_files.items():
         save_path = Path(save_directory) / file_path

         if file_path.endswith("safetensors"):
-            jobs.append(
-                (_process_file, resolved_path, save_path, scheme, ignore, device)
-            )
+            jobs.append((job_fn, resolved_path, save_path, scheme, ignore, device))

         else:
             if is_weights_file(file_path):
-                logger.warning(f"Skipping weights file {file_path}")
+                logger.warning(f"Skip processing for weights file {file_path}")
             save_path.parent.mkdir(parents=True, exist_ok=True)
             logger.info(f"Copying {file_path} {save_path}")
             shutil.copyfile(resolved_path, save_path)
@@ -89,50 +94,3 @@ def model_free_ptq(
     # 5. update config and safetensors index
     update_config(save_directory, scheme_name, scheme, ignore)
     update_safetensors_index(save_directory, total_size, weight_map)
-
-
-def _process_file(
-    file_path: str | os.PathLike,
-    save_path: str | os.PathLike,
-    scheme: QuantizationScheme,
-    ignore: str | list[str],
-    device: str | torch.device,
-) -> tuple[int, dict[str, str]]:
-    """
-    Quantize and compress tensors in a given safetensors file
-
-    :param file_path: safetensors file to process
-    :param save_path: save path of file with quantized weights
-    :param scheme: quantization scheme to apply to tensors
-    :param ignore: modules to ignore. Modules ending with "norm" are automatically
-        ignored
-    :param device: device used to quantize and compress weights
-    """
-    tensors = load_file(file_path)
-
-    for name in list(tensors.keys()):
-        module_name, param_name = name.rsplit(".", 1)
-        is_linear_weight = param_name == "weight" and not module_name.endswith("norm")
-        is_ignored = any(_match_name(module_name, ign) for ign in ignore)
-        if not is_linear_weight or is_ignored:
-            continue
-
-        # 1. initialize module with qparams (on device)
-        module = initialize_quantized_linear(tensors[name], scheme, device)
-
-        # 2. calibrate weight qparams
-        calibrate_weights(module)
-
-        # 3. compress module using qparams
-        compress_module(module)
-
-        # 4. save compressed data (on cpu)
-        del tensors[name]
-        prefix = module_name + "."
-        for key, value in module.state_dict(prefix=prefix).items():
-            tensors[key] = value.to("cpu")
-
-    save_file(tensors, save_path)
-    total_size = sum(tensor.nbytes for tensor in tensors.values())
-    weight_map = {key: os.path.basename(save_path) for key in tensors.keys()}
-    return total_size, weight_map
```
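The executor that consumes `jobs` sits outside this hunk's context. Below is a plausible sketch of how the `(fn, *args)` tuples could be drained, based on the `(total_size, weight_map)` return signature of the removed `_process_file`; the pool choice and progress bar are assumptions, not the commit's actual code:

```python
# Hypothetical sketch of draining the `jobs` list built above; not the commit's
# actual executor, which is outside the diff context.
from concurrent.futures import ThreadPoolExecutor

import tqdm

def run_jobs(jobs: list[tuple], max_workers: int = 15) -> tuple[int, dict[str, str]]:
    total_size, weight_map = 0, {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(fn, *args) for fn, *args in jobs]
        for future in tqdm.tqdm(futures, desc="Processing shards"):
            size, file_map = future.result()  # each job returns (nbytes, weight_map)
            total_size += size
            weight_map.update(file_map)
    return total_size, weight_map
```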
Lines changed: 81 additions & 50 deletions
```diff
@@ -1,51 +1,25 @@
-from typing import Optional
+import os
+from collections import defaultdict
+from typing import Mapping, TypeVar

 import torch
-from compressed_tensors.quantization import QuantizationScheme, preset_name_to_scheme
-from compressed_tensors.utils import getattr_chain
 from compressed_tensors.utils.match import _match_name
 from loguru import logger
+from transformers.file_utils import CONFIG_NAME

-__all__ = ["validate_scheme", "gpu_if_available", "is_match_name"]
+__all__ = [
+    "gpu_if_available",
+    "find_safetensors_index_path",
+    "find_config_path",
+    "find_safetensors_index_file",
+    "match_names_set_eager",
+    "MatchedNamesSet",
+    "invert_mapping",
+]

-
-def validate_scheme(scheme: QuantizationScheme) -> tuple[str, QuantizationScheme]:
-    # treat strings as preset schemes
-    if isinstance(scheme, str):
-        scheme_name, scheme = scheme, preset_name_to_scheme(scheme, [])
-    else:
-        scheme_name = "config_group_0"
-
-    # weight quantization must be provided
-    if scheme.weights is None:
-        raise ValueError(
-            "Must provide a weights quanitization scheme to perform weights-only PTQ"
-        )
-
-    # activation quantization must be dynamic
-    input_dynamic = getattr_chain(scheme, "input_activations.dynamic", True)
-    output_dynamic = getattr_chain(scheme, "output_activations.dynamic", True)
-    if input_dynamic is not True or output_dynamic is not True:
-        raise ValueError(
-            "Model Free PTQ cannot calibrate activations. "
-            "Please use `oneshot` instead."
-        )
-
-    # override with static observers
-    # Remove after https://github.com/vllm-project/compressed-tensors/pull/489
-    if scheme.weights.observer in ("minmax", "mse"):
-        new_observer = f"static_{scheme.weights.observer}"
-        logger.warning(
-            f"Scheme uses {scheme.weights.observer} weight observer. "
-            f"Using {new_observer} instead"
-        )
-        scheme.weights.observer = new_observer
-
-    # target all modules; filter by ignore list
-    # technically this should be "re:.*", but vllm's
-    # ct moe layer has a hard coded check for "Linear"
-    scheme.targets = ["Linear"]
-    return scheme_name, scheme
+KeyType = TypeVar("K")
+ValueType = TypeVar("V")
+MatchedNamesSet = dict[str, str | None]


 def gpu_if_available(device: torch.device | str | None) -> torch.device:
@@ -63,13 +37,70 @@ def gpu_if_available(device: torch.device | str | None) -> torch.device:
     return torch.device("cpu")


-def is_match_name(
-    name: str, targets: list[str], ignore: Optional[str | list[str]] = None
-) -> bool:
-    targets = targets if isinstance(targets, list) else [targets]
-    ignore = ignore if isinstance(ignore, list) else [ignore]
+def find_safetensors_index_path(save_directory: str | os.PathLike) -> str | None:
+    for file_name in os.listdir(save_directory):
+        if file_name.endswith("safetensors.index.json"):
+            return os.path.join(save_directory, file_name)
+
+    return None
+
+
+def find_config_path(save_directory: str | os.PathLike) -> str | None:
+    for file_name in os.listdir(save_directory):
+        if file_name in (CONFIG_NAME, "params.json"):
+            return os.path.join(save_directory, file_name)
+
+    return None
+
+
+def find_safetensors_index_file(model_files: dict[str, str]) -> str | None:
+    for file_path, resolved_path in model_files.items():
+        if file_path.endswith("safetensors.index.json"):
+            return resolved_path
+
+    return None
+
+
+def match_names_set_eager(
+    names: set[str] | list[str],
+    targets: set[str] | list[str],
+    return_unmatched: bool = True,
+) -> list[MatchedNamesSet] | tuple[list[MatchedNamesSet], MatchedNamesSet]:
+    matched_sets = []
+    matches = dict.fromkeys(targets, None)
+
+    for name in names:
+        # match until we get a full set
+        for target in targets:
+            if _match_name(name, target):
+                if matches[target] is None:
+                    matches[target] = name
+                else:
+                    # matched target twice without completing a set
+                    raise ValueError(
+                        f"Matched a {target} twice before "
+                        f"completing set ({matches[target]}, {name})"
+                    )
+
+        # once we have a full set, yield and reset
+        if all((matches[target] is not None for target in targets)):
+            matched_sets.append(matches)
+            matches = dict.fromkeys(targets, None)
+
+    unmatched_set = matches if any((v is not None for v in matches.values())) else None
+
+    if return_unmatched:
+        return matched_sets, unmatched_set
+    else:
+        return matched_sets
+
+
+def invert_mapping(
+    mapping: Mapping[KeyType, ValueType],
+) -> dict[ValueType, list[KeyType]]:
+    inverse = defaultdict(list)

-    matches_target = any(_match_name(name, target) for target in targets)
-    matches_ignore = any(_match_name(name, ign) for ign in ignore)
+    for key, value in mapping.items():
+        inverse[value].append(key)

-    return matches_target and not matches_ignore
+    return inverse
```
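A usage sketch for the two new helpers (toy module names; behavior follows the definitions above): `match_names_set_eager` greedily groups names into complete target sets, and `invert_mapping` turns a tensor-to-shard map into shard-to-tensors.

```python
from llmcompressor.entrypoints.model_free.helpers import (
    invert_mapping,
    match_names_set_eager,
)

names = [
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.self_attn.k_proj.weight",
    "model.layers.0.self_attn.v_proj.weight",
    "model.layers.1.self_attn.q_proj.weight",  # incomplete set
]
targets = ["re:.*q_proj.weight$", "re:.*k_proj.weight$", "re:.*v_proj.weight$"]

# layer 0 forms one complete q/k/v set; layer 1's lone q_proj is returned unmatched
matched_sets, unmatched = match_names_set_eager(names, targets)
assert len(matched_sets) == 1 and unmatched is not None

# invert a tensor -> shard map into shard -> [tensors]
weight_map = {"a.weight": "shard-0.safetensors", "b.weight": "shard-0.safetensors"}
print(dict(invert_mapping(weight_map)))
# {'shard-0.safetensors': ['a.weight', 'b.weight']}
```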
