added test for vllm fq export

kinjalpatel27 · kinjalpatel27 · commit 32f7ae895744 · 2025-11-21T21:43:05.000Z
Signed-off-by: Kinjal Patel &lt;kinjalpravin@nvidia.com&gt;
diff --git a/tests/gpu/torch/export/test_vllm_fakequant_export.py b/tests/gpu/torch/export/test_vllm_fakequant_export.py
@@ -0,0 +1,194 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+import torch
+from copy import deepcopy
+from functools import partial
+import modelopt.torch.quantization as mtq
+from modelopt.torch.export.unified_export_hf import export_hf_checkpoint
+from modelopt.torch.export.unified_export_megatron import export_mcore_gpt_to_hf
+from _test_utils.torch.transformers_models import create_tiny_llama_dir
+from _test_utils.torch.distributed.utils import spawn_multiprocess_job
+from _test_utils.torch.megatron.models import get_mcore_gpt_model
+from _test_utils.import_helper import skip_if_no_megatron
+from transformers import AutoModelForCausalLM
+
+import os
+import json
+
+skip_if_no_megatron(apex_or_te_required=True)
+
+@pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG])
+def test_hf_vllm_export(tmp_path, quant_cfg):
+    """Test HuggingFace model export for vLLM with fake quantization.
+    
+    This test verifies:
+    1. Model weights match before and after export
+    2. quant_amax.pth file is created, huggingface config file does not exist
+    3. Amax values are correctly extracted and saved in quant_amax.pth file
+    """
+    
+    # Create a tiny LLaMA model for testing
+    tiny_model_dir = create_tiny_llama_dir(tmp_path, with_tokenizer=True, num_hidden_layers=2)
+    
+    # Load the model
+    model = AutoModelForCausalLM.from_pretrained(tiny_model_dir)
+    model = model.cuda()
+    model.eval()
+    
+    # Quantize the model
+    def forward_loop(model):
+        input_ids = torch.randint(0, model.config.vocab_size, (1, 128)).cuda()
+        with torch.no_grad():
+            model(input_ids)
+    
+    model = mtq.quantize(model, quant_cfg, forward_loop)
+    
+    model_state_dict = deepcopy(model.state_dict())
+
+    # Export directory
+    export_dir = tmp_path / "vllm_export"
+    export_dir.mkdir(exist_ok=True)
+    
+    # Export for vLLM
+    export_hf_checkpoint(model, export_dir=export_dir, export_vllm_fq_weights_qstate=True)
+
+    # check if quant_amax.pth file exists
+    quant_amax_file = export_dir / "quant_amax.pth"
+    assert quant_amax_file.exists(), f"quant_amax.pth file should be created in {export_dir}"
+    
+    # make sure hf_quant_config.json file does not exist
+    hf_quant_config_file = export_dir / "hf_quant_config.json"
+    assert not hf_quant_config_file.exists(), f"hf_quant_config.json file should not be created in {export_dir}"
+
+    # check weights match before and after export
+    model_after = AutoModelForCausalLM.from_pretrained(export_dir)
+    model_after = model_after.cuda()
+    model_after.eval()
+    model_after_state_dict = model_after.state_dict()
+    amax_state_dict = {}
+    for key in model_state_dict.keys():
+        if key.endswith("_amax"):
+            amax_state_dict[key] = model_state_dict[key]
+            continue
+        
+        assert torch.allclose(model_state_dict[key], model_after_state_dict[key], atol=1e-6), (
+            f"Weight mismatch for {key}: "
+            f"before shape={model_state_dict[key].shape}, after shape={model_after_state_dict[key].shape}, "
+            f"max diff={torch.abs(model_state_dict[key] - model_after_state_dict[key]).max()}"
+        )
+
+    # Verify amax values are correct
+    amax_dict = torch.load(quant_amax_file)
+    assert len(amax_dict) > 0, "amax_dict should not be empty"
+    assert amax_dict.keys() == amax_state_dict.keys(), f"amax keys mismatch between before and after export"
+
+
+def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size):
+    """Test megatron-core model export for vLLM with fake quantization.
+    
+    """
+    # Create a tiny mcore GPT model
+    num_layers = 2
+    hidden_size = 64
+    num_attention_heads = 8
+    num_query_groups = size
+    ffn_hidden_size = 128
+    max_sequence_length = 32
+    vocab_size = 64
+    
+    model = get_mcore_gpt_model(
+        tensor_model_parallel_size=size,
+        pipeline_model_parallel_size=1,
+        initialize_megatron=True,
+        num_layers=num_layers,
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        num_query_groups=num_query_groups,
+        ffn_hidden_size=ffn_hidden_size,
+        max_sequence_length=max_sequence_length,
+        vocab_size=vocab_size,
+        activation_func="swiglu",
+        normalization="RMSNorm",
+        transformer_impl="modelopt",
+    ).cuda()
+    model.eval()
+    
+    # Quantize the model
+    def forward_loop(model):
+        batch_size = 1
+        seq_len = 32
+        input_ids = torch.randint(0, vocab_size, (batch_size, seq_len)).cuda()
+        position_ids = torch.arange(seq_len).unsqueeze(0).cuda()
+        # Create causal attention mask
+        attention_mask = torch.tril(torch.ones((1, 1, seq_len, seq_len))).cuda()
+        attention_mask = attention_mask < 0.5  # Convert to boolean mask
+        with torch.no_grad():
+            model(input_ids, position_ids, attention_mask)
+    
+    model = mtq.quantize(model, quant_cfg, forward_loop)
+    
+    model_state_dict = deepcopy(model.state_dict())
+
+    # Create HF config for export
+    pretrained_config = {
+        "architectures": ["LlamaForCausalLM"],
+        "attention_bias": False,
+        "hidden_size": hidden_size,
+        "intermediate_size": ffn_hidden_size,
+        "max_position_embeddings": max_sequence_length,
+        "model_type": "llama",
+        "num_attention_heads": num_attention_heads,
+        "num_hidden_layers": num_layers,
+        "num_key_value_heads": num_query_groups,
+        "torch_dtype": "bfloat16",
+    }
+    
+    with open(tmp_path / "config.json", "w") as f:
+        json.dump(pretrained_config, f)
+
+    # Export directory
+    export_dir = tmp_path / "vllm_export"
+    export_dir.mkdir(exist_ok=True)
+    
+    # Export for vLLM
+    export_mcore_gpt_to_hf(
+        model,
+        pretrained_model_name_or_path=tmp_path,
+        dtype=torch.bfloat16,
+        export_dir=str(export_dir),
+        export_vllm_fq_weights_qstate=True,
+    )
+
+    # check if quant_amax.pth file exists
+    quant_amax_file = export_dir / "quant_amax.pth"
+    assert quant_amax_file.exists(), f"quant_amax.pth file should be created in {export_dir}"
+    
+    # make sure hf_quant_config.json file does not exist
+    hf_quant_config_file = export_dir / "hf_quant_config.json"
+    assert not hf_quant_config_file.exists(), f"hf_quant_config.json file should not be created in {export_dir}"
+
+
+@pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG])
+def test_mcore_vllm_export(tmp_path, quant_cfg):
+    """Wrapper test function for mcore vLLM export."""
+    spawn_multiprocess_job(
+        size=1,
+        job=partial(_test_mcore_vllm_export, tmp_path, quant_cfg),
+        backend="nccl",
+    )
+
+