
Commit cff3cc6

Added support to export BF16 weights and amax

Signed-off-by: Kinjal Patel <[email protected]>
1 parent 1d0ee04

File tree

4 files changed: +231 -228 lines

examples/vllm_serve/convert_amax_hf2vllm.py

Lines changed: 0 additions & 213 deletions
This file was deleted.

examples/vllm_serve/fakequant_worker.py

Lines changed: 74 additions & 1 deletion
@@ -15,7 +15,9 @@
 
 import dataclasses
 import os
+import re
 import warnings
+from collections import defaultdict
 from contextlib import contextmanager
 from typing import Any
 
@@ -30,6 +32,68 @@
 from modelopt.torch.utils.dataset_utils import get_dataset_dataloader
 
 
+def convert_amax_hf2vllm(
+    hf_state_dict: dict[str, torch.Tensor],
+) -> dict[str, torch.Tensor]:
+    """
+    Convert amax values from HuggingFace format to vLLM format.
+
+    This function merges:
+    - q_proj, k_proj, v_proj amax values into qkv_proj (taking the max)
+    - gate_proj, up_proj amax values into gate_up_proj (taking the max)
+
+    Args:
+        hf_state_dict: HuggingFace state dict containing amax values
+
+    Returns:
+        vLLM-format state dict with merged amax values
+    """
+    vllm_state_dict = {}
+
+    # Group keys by their base pattern (without the specific projection name)
+    merge_groups = defaultdict(list)
+
+    for key, value in hf_state_dict.items():
+        if "_amax" not in key:
+            # Copy non-amax keys as-is
+            vllm_state_dict[key] = value
+            continue
+
+        # Check if this is a q/k/v projection that needs merging
+        qkv_match = re.search(r"(.*\.)([qkv])_proj(\..+_amax)$", key)
+        if qkv_match:
+            base_pattern = qkv_match.group(1) + "qkv_proj" + qkv_match.group(3)
+            merge_groups[base_pattern].append((key, value))
+            continue
+
+        # Check if this is a gate/up projection that needs merging
+        gate_up_match = "mixer" not in key and re.search(r"(.*\.)(gate|up)_proj(\..+_amax)$", key)
+        if gate_up_match:
+            base_pattern = gate_up_match.group(1) + "gate_up_proj" + gate_up_match.group(3)
+            merge_groups[base_pattern].append((key, value))
+            continue
+
+        # Copy other amax keys as-is (e.g. o_proj, down_proj)
+        vllm_state_dict[key] = value
+
+    # Merge grouped amax values by taking the maximum
+    for merged_key, key_value_pairs in merge_groups.items():
+        if len(key_value_pairs) > 1:
+            # Take the maximum across all values for this merged key
+            values = [value for _, value in key_value_pairs]
+            merged_value = torch.stack(values).max(dim=0)[0]
+            vllm_state_dict[merged_key] = merged_value
+            print(f"Merged {len(key_value_pairs)} keys into {merged_key}")
+            for orig_key, _ in key_value_pairs:
+                print(f"  - {orig_key}")
+        else:
+            # Single key, just rename it
+            _, value = key_value_pairs[0]
+            vllm_state_dict[merged_key] = value
+
+    return vllm_state_dict
+
+
 @contextmanager
 def disable_compilation(model):
     do_not_compile = True
@@ -154,8 +218,17 @@ def calibrate_loop(model: Any = None) -> None:
         if amax_file_path:
             print(f"Loading amax values from {amax_file_path}")
             saved_amax_dict = torch.load(amax_file_path)
-            current_state_dict = model.state_dict()
+            # convert amax keys to vLLM format
+            if hasattr(self.model_runner.model, "hf_to_vllm_mapper"):
+                saved_amax_dict = self.model_runner.model.hf_to_vllm_mapper.apply_dict(saved_amax_dict)
+                saved_amax_dict = {
+                    key.replace("quantizer_amax", "quantizer._amax"): value
+                    for key, value in saved_amax_dict.items()
+                    if key.endswith("quantizer_amax")
+                }
+            saved_amax_dict = convert_amax_hf2vllm(saved_amax_dict)
 
+            current_state_dict = model.state_dict()
             # Count amax keys in checkpoint and model
             checkpoint_amax_keys = [key for key in saved_amax_dict if key.endswith("_amax")]
             model_amax_keys = [key for key in current_state_dict if key.endswith("_amax")]
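For reference, here is a minimal sketch of what the new convert_amax_hf2vllm helper (defined in fakequant_worker.py above) does to a HuggingFace-style amax dict. The layer names and values are hypothetical and only illustrate the q/k/v and gate/up merging:

import torch

# Hypothetical HF-style amax entries for one decoder layer (illustrative names only).
hf_amax = {
    "model.layers.0.self_attn.q_proj.input_quantizer._amax": torch.tensor(1.0),
    "model.layers.0.self_attn.k_proj.input_quantizer._amax": torch.tensor(2.0),
    "model.layers.0.self_attn.v_proj.input_quantizer._amax": torch.tensor(0.5),
    "model.layers.0.mlp.gate_proj.input_quantizer._amax": torch.tensor(3.0),
    "model.layers.0.mlp.up_proj.input_quantizer._amax": torch.tensor(4.0),
    "model.layers.0.mlp.down_proj.input_quantizer._amax": torch.tensor(1.5),
}

vllm_amax = convert_amax_hf2vllm(hf_amax)

# q/k/v entries collapse into qkv_proj and gate/up into gate_up_proj, each taking the
# elementwise max; keys that need no merging (e.g. o_proj, down_proj) are copied as-is.
assert vllm_amax["model.layers.0.self_attn.qkv_proj.input_quantizer._amax"] == 2.0
assert vllm_amax["model.layers.0.mlp.gate_up_proj.input_quantizer._amax"] == 4.0
assert vllm_amax["model.layers.0.mlp.down_proj.input_quantizer._amax"] == 1.5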

modelopt/torch/export/unified_export_hf.py

Lines changed: 43 additions & 2 deletions
@@ -33,7 +33,11 @@
 from modelopt.torch.quantization import set_quantizer_by_cfg_context
 from modelopt.torch.quantization.nn import SequentialQuantizer, TensorQuantizer
 from modelopt.torch.quantization.qtensor import NVFP4QTensor
-from modelopt.torch.quantization.utils import fsdp2_aware_weight_update, quantizer_attr_names
+from modelopt.torch.quantization.utils import (
+    fsdp2_aware_weight_update,
+    get_quantizer_state_dict,
+    quantizer_attr_names,
+)
 
 from .convert_hf_config import convert_hf_quant_config_format
 from .layer_utils import (
@@ -73,7 +77,7 @@
     to_quantized_weight,
 )
 
-__all__ = ["export_hf_checkpoint"]
+__all__ = ["export_hf_bf16_weights_amax", "export_hf_checkpoint"]
 
 
 def _is_enabled_quantizer(quantizer):
@@ -588,3 +592,40 @@ def export_hf_checkpoint(
             " can be saved with torch.save for further inspection."
         )
         raise e
+
+
+def export_hf_bf16_weights_amax(
+    model: nn.Module,
+    export_dir: Path | str = tempfile.gettempdir(),
+):
+    """Export the model weights and amax values separately for use with vLLM fakequant serving.
+
+    This function:
+    1. Extracts amax values for calibration
+    2. Deletes all quantizer parameters from the state dict so only weights in the original dtype are stored
+    3. Saves the model checkpoint (with weights in the original dtype) and the amax values separately
+
+    Args:
+        model: The quantized model to export
+        export_dir: Directory to save the model and artifacts
+    """
+    export_dir = Path(export_dir)
+    export_dir.mkdir(parents=True, exist_ok=True)
+
+    amax_dict = {
+        name + "._amax": param["_amax"].detach().clone().cpu()
+        for name, param in get_quantizer_state_dict(model).items()
+        if "_amax" in param
+    }
+
+    # Remove quantizers from the model
+    for name, module in model.named_modules():
+        if is_quantlinear(module):
+            delattr(module, "weight_quantizer")
+            delattr(module, "input_quantizer")
+            delattr(module, "output_quantizer")
+            module.export()
+
+    # Save the model without quantizer parameters
+    model.save_pretrained(export_dir)
+    torch.save(amax_dict, f"{export_dir}/quant_amax.pth")
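A rough usage sketch of the new export_hf_bf16_weights_amax entry point follows. The model ID, the FP8 config choice, and the placeholder forward loop are assumptions for illustration; the function itself is defined in modelopt.torch.export.unified_export_hf as shown above:

import torch
import modelopt.torch.quantization as mtq
from transformers import AutoModelForCausalLM
from modelopt.torch.export.unified_export_hf import export_hf_bf16_weights_amax

# Hypothetical model; any HF causal LM supported by modelopt should work similarly.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16
)

def forward_loop(m):
    # Placeholder calibration loop; in practice run representative prompts through m here.
    pass

# Quantize (fake-quant) the model so amax values are attached to the quantizers.
model = mtq.quantize(model, mtq.FP8_DEFAULT_CFG, forward_loop=forward_loop)

# Writes the HF checkpoint with BF16 weights to export_dir and the amax values to
# export_dir/quant_amax.pth, which the vLLM fakequant worker loads via amax_file_path.
export_hf_bf16_weights_amax(model, export_dir="exported_bf16_amax")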
