
Commit 9946463

Updated docs

Parent: 096ee13

File tree

3 files changed: +58 -51 lines

CHANGELOG.rst

Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@ Model Optimizer Changelog (Linux)
 - Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
 - Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
 - Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow.
-- Added support for QAT fakequant evaluation in vLLM.
+- Added support for QAT fakequant evaluation in vLLM for fast evaluation of arbitrary quantization formats. See ``examples/vllm_serve/README.md#load-qatptq-model-and-serve-in-vllm-wip`` for more details.

 0.39 (2025-11-11)
 ^^^^^^^^^^^^^^^^^

examples/vllm_serve/README.md

Lines changed: 10 additions & 6 deletions

@@ -55,15 +55,18 @@ lm_eval --model local-completions --tasks gsm8k --model_args model=<model_name>,

 ## Load QAT/PTQ model and serve in vLLM (WIP)

-Overwrite the calibrated amax value with prepared values from either PTQ/QAT. This is only tested for Llama3.1
+Overwrite the calibrated amax values with prepared values from either QAT/PTQ.

-Step 1: convert amax to merged amax, using llama3.1 as an example:
+Step 1: export the model with bf16 weights and amax values.

-```bash
-python convert_amax_hf2vllm.py -i <amax.pth> -o <vllm_amax.pth>
-```
+- For HF models, set `export_bf16_weights_amax` when exporting with `modelopt.torch.export.unified_export_hf.export_hf_checkpoint`.
+- For MCore models, set `export_bf16_weights_amax` when exporting with `modelopt.torch.export.unified_export_megatron.export_mcore_gpt_to_hf`.
+
+Step 2: point the `AMAX_FILE_PATH` environment variable at the `quant_amax.pth` file exported in step 1. For example:

-Step 2: add `<vllm_amax.pth>` to `quant_config` in `vllm_serve_fakequant.py`
+```
+AMAX_FILE_PATH=<quant_amax.pth> QUANT_CFG=<quant_config> python vllm_serve_fakequant.py <model_path> -tp 8 --host 0.0.0.0 --port 8000
+```

 ## Important Notes

@@ -85,3 +88,4 @@ torch.distributed.barrier()
 ## Known Problems

 1. AWQ is not yet supported in vLLM.
+2. PTQ/QAT checkpoints don't work with KV cache quantization enabled.
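
A minimal sketch of README step 1 for the HF path, based on the `export_hf_checkpoint` signature introduced in this commit. It assumes `model` is a Hugging Face model already quantized with ModelOpt (QAT or PTQ); the export directory name is illustrative:

```python
# Sketch of README step 1 (HF path); `model` is assumed to be already quantized with ModelOpt.
# The directory name "exported_bf16_amax" is illustrative.
import os

from modelopt.torch.export.unified_export_hf import export_hf_checkpoint

export_dir = "exported_bf16_amax"
os.makedirs(export_dir, exist_ok=True)

export_hf_checkpoint(
    model,
    export_dir=export_dir,            # receives the bf16 weights and quant_amax.pth
    export_bf16_weights_amax=True,    # skip quantized export; save amax values separately
)

# README step 2 then points AMAX_FILE_PATH at <export_dir>/quant_amax.pth when launching
# vllm_serve_fakequant.py.
```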

modelopt/torch/export/unified_export_hf.py

Lines changed: 47 additions & 44 deletions

@@ -76,7 +76,7 @@
     to_quantized_weight,
 )

-__all__ = ["export_hf_bf16_weights_amax", "export_hf_checkpoint"]
+__all__ = ["export_hf_checkpoint"]


 def _is_enabled_quantizer(quantizer):

@@ -531,11 +531,44 @@ def _export_hf_checkpoint(
     return quantized_state_dict, quant_config


+def _export_hf_bf16_weights_amax(
+    model: nn.Module,
+) -> tuple[dict[str, torch.Tensor], dict[str, Any]]:
+    """Exports the torch model weights and amax values separately.
+
+    This function:
+    1. Extracts amax values for calibration
+    2. Deletes all quantizer parameters from the state dict to store only weights in the original dtype
+
+    Args:
+        model: The quantized model to export
+
+    Returns:
+        post_state_dict: Dict containing the model weights in the original dtype
+        amax_dict: Dict containing amax values
+    """
+    amax_dict = {
+        name + "._amax": param["_amax"].detach().clone().cpu()
+        for name, param in get_quantizer_state_dict(model).items()
+        if "_amax" in param
+    }
+
+    # Remove quantizers from the model so only plain weights remain in the state dict
+    for name, module in model.named_modules():
+        if is_quantlinear(module):
+            delattr(module, "weight_quantizer")
+            delattr(module, "input_quantizer")
+            delattr(module, "output_quantizer")
+            module.export()
+    return model.state_dict(), amax_dict
+
+
 def export_hf_checkpoint(
     model: nn.Module,
     dtype: torch.dtype | None = None,
     export_dir: Path | str = tempfile.gettempdir(),
     save_modelopt_state: bool = False,
+    export_bf16_weights_amax: bool = False,
 ):
     """Exports the torch model to unified checkpoint and saves to export_dir.

@@ -557,13 +590,19 @@ def export_hf_checkpoint(
         return

     try:
-        post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype)
+        if export_bf16_weights_amax:
+            post_state_dict, amax_dict = _export_hf_bf16_weights_amax(model)
+            hf_quant_config = None
+            torch.save(amax_dict, f"{export_dir}/quant_amax.pth")
+        else:
+            post_state_dict, hf_quant_config = _export_hf_checkpoint(model, dtype)

-        # Save hf_quant_config.json for backward compatibility
-        with open(f"{export_dir}/hf_quant_config.json", "w") as file:
-            json.dump(hf_quant_config, file, indent=4)
+        if hf_quant_config is not None:
+            # Save hf_quant_config.json for backward compatibility
+            with open(f"{export_dir}/hf_quant_config.json", "w") as file:
+                json.dump(hf_quant_config, file, indent=4)

-        hf_quant_config = convert_hf_quant_config_format(hf_quant_config)
+            hf_quant_config = convert_hf_quant_config_format(hf_quant_config)

         # Save model
         model.save_pretrained(

@@ -576,7 +615,8 @@ def export_hf_checkpoint(
         with open(original_config) as file:
             config_data = json.load(file)

-        config_data["quantization_config"] = hf_quant_config
+        if hf_quant_config is not None:
+            config_data["quantization_config"] = hf_quant_config

         with open(original_config, "w") as file:
             json.dump(config_data, file, indent=4)

@@ -587,40 +627,3 @@ def export_hf_checkpoint(
             " can be saved with torch.save for further inspection."
         )
         raise e
-
-
-def export_hf_bf16_weights_amax(
-    model: nn.Module,
-    export_dir: Path | str = tempfile.gettempdir(),
-):
-    """Exports the torch model weights and amax values separately which can be used for vLLM fakequant serve.
-
-    This function:
-    1. Extracts amax values for calibration
-    2. Deletes all quantizer parameters from state dict to store only weights in original dtype
-    3. Saves model checkpoint (with weights in original dtype) and amax values separately
-
-    Args:
-        model: The quantized model to export
-        export_dir: Directory to save the model and artifacts
-    """
-    export_dir = Path(export_dir)
-    export_dir.mkdir(parents=True, exist_ok=True)
-
-    amax_dict = {
-        name + "._amax": param["_amax"].detach().clone().cpu()
-        for name, param in get_quantizer_state_dict(model).items()
-        if "_amax" in param
-    }
-
-    # remove quantizer from model
-    for name, module in model.named_modules():
-        if is_quantlinear(module):
-            delattr(module, "weight_quantizer")
-            delattr(module, "input_quantizer")
-            delattr(module, "output_quantizer")
-            module.export()
-
-    # Save with model without quantizer parameters
-    model.save_pretrained(export_dir)
-    torch.save(amax_dict, f"{export_dir}/quant_amax.pth")
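
As a side note on the new export path above, the `quant_amax.pth` file it writes is a plain dict of CPU tensors keyed by quantizer name with a `._amax` suffix, so it can be inspected directly with `torch.load`; the file path below is illustrative:

```python
# Inspect the amax dict written by the export_bf16_weights_amax path.
# The file path is illustrative; keys follow the "<quantizer name>._amax" pattern
# built in _export_hf_bf16_weights_amax above.
import torch

amax_dict = torch.load("exported_bf16_amax/quant_amax.pth", map_location="cpu")
for key, value in list(amax_dict.items())[:5]:
    print(key, tuple(value.shape), value.dtype)
```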
