
Commit efa63db

chore: address reviews
1 parent bd66d44 commit efa63db

File tree: 6 files changed (+63, −71 lines)


py/torch_tensorrt/dynamo/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -15,7 +15,6 @@
     save_cross_compiled_exported_program,
 )
 from ._exporter import export
-from ._quantization import quantize
 from ._refit import refit_module_weights
 from ._settings import CompilationSettings
 from ._SourceIR import SourceIR

py/torch_tensorrt/dynamo/_quantization.py

Lines changed: 0 additions & 35 deletions
This file was deleted.

tools/llm/README.md

Lines changed: 15 additions & 5 deletions
@@ -39,6 +39,7 @@ We have officially verified support for the following models:
 #### Text-only LLMs: `run_llm.py`
 
 ```bash
+python run_llm.py --model meta-llama/Llama-3.2-1B-Instruct --prompt "What is parallel programming?" --model_precision FP16 --num_tokens 128 --cache static_v2 --benchmark
 python run_llm.py --model meta-llama/Llama-3.2-1B-Instruct --prompt "What is parallel programming?" --precision FP16 --num_tokens 128 --cache static_v2 --benchmark
 ```
 
@@ -54,8 +55,8 @@ python run_vlm.py --model nvidia/Eagle2-2B --precision FP16 --num_tokens 128 --c
 - `--tokenizer`: (Optional) Tokenizer name; defaults to model.
 - `--prompt`: Input prompt for generation.
 - `--image_path`: (Optional) Path to input image file for VLM models. If not provided, will use a sample image.
-- `--precision`: Precision mode (`FP16`, `FP32`).
-- `--quant_format`: Quantization format (`fp8`, `nvfp4`) to apply.
+- `--model_precision`: Precision of model weight/buffer (`FP16`, `BF16`, `FP32`).
+- `--quant_format`: (Optional) Quantization format (`fp8`, `nvfp4`) to apply.
 - `--num_tokens`: Number of output tokens to generate.
 - `--cache`: KV cache type (`static_v1`, `static_v2`, or empty for no KV caching).
 - `--benchmark`: Enable benchmarking mode.
@@ -68,17 +69,26 @@ Torch-TensorRT supports quantization to reduce model memory footprint and improv
 #### Using Pre-quantized Models
 
 To use pre-quantized models from HuggingFace:
+If a model contains quantization configuration (detected automatically), the model's linear layers are converted to TensorRT quantized versions using the specified quantization algorithm (e.g., FP8, NVFP4). The quantization algorithm type is displayed during conversion.
+
+**Note:** The `--quant_format` option will raise an error if it's used with pre-quantized models, as quantization cannot be applied to models that are already quantized.
 
 ```bash
-python run_llm.py --model nvidia/Llama-3.1-8B-Instruct-FP8 --prompt "What is parallel programming?" --precision FP16 --num_tokens 128
+python run_llm.py --model nvidia/Llama-3.1-8B-Instruct-FP8 --prompt "What is parallel programming?" --model_precision FP16 --num_tokens 128
+```
+
+**Expected output:**
+```
+Model is FP8 pre-quantized hf model. Quantized linear layers are applied
 ```
 
 #### Applying quantization by ModelOpt
 
-Apply fp8 quantization from HuggingFace:
+To apply quantization to non-quantized models using ModelOpt:
+The `--quant_format` option calls `mtq.quantize()` to apply ModelOpt post-training quantization to the model.
 
 ```bash
-python run_llm.py --model meta-llama/Llama-3.1-8B --quant_format fp8 --prompt "What is parallel programming?" --precision FP16 --num_tokens 128
+python run_llm.py --model meta-llama/Llama-3.1-8B --quant_format fp8 --prompt "What is parallel programming?" --model_precision FP16 --num_tokens 128
 ```
 
 #### Quantization Requirements
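For readers of the README change above, here is a minimal sketch of the ModelOpt post-training quantization path that `--quant_format` triggers, assuming `nvidia-modelopt` is installed. The helper name `apply_ptq` and the bare `calibrate_loop` argument are illustrative; the actual script builds its calibration loop from ModelOpt's dataset utilities, as shown in `quantize_utils.py` below.

```python
import modelopt.torch.quantization as mtq
import torch


def apply_ptq(model: torch.nn.Module, quant_format: str, calibrate_loop):
    """Sketch of the --quant_format path: pick a ModelOpt config and calibrate."""
    if quant_format == "fp8":
        quant_cfg = mtq.FP8_DEFAULT_CFG
    elif quant_format == "nvfp4":
        quant_cfg = mtq.NVFP4_DEFAULT_CFG
    else:
        raise RuntimeError("Unsupported quantization format")

    # calibrate_loop(model) runs calibration batches through the model;
    # mtq.quantize inserts quantizers and derives amax values from that loop.
    return mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
```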

tools/llm/quantize_utils.py

Lines changed: 30 additions & 20 deletions
@@ -4,7 +4,6 @@
 
 import huggingface_hub
 import torch
-import torch_tensorrt
 from huggingface_hub import snapshot_download
 
 logger = logging.getLogger(__name__)
@@ -25,6 +24,11 @@
 )
 from safetensors import safe_open
 
+# FP8 E4M3 format has a maximum representable value of 448.0
+MAX_BOUND_FP8 = 448.0
+# Additional scaling factor for NVFP4
+MAX_BOUND_NVFP4 = 6.0
+
 
 def quantize_model(model, args, tokenizer):
     """
@@ -52,11 +56,17 @@ def quantize_model(model, args, tokenizer):
         num_samples=512,
         device="cuda:0",
     )
-
+    if args.quant_format == "fp8":
+        quant_cfg = mtq.FP8_DEFAULT_CFG
+    elif args.quant_format == "nvfp4":
+        quant_cfg = mtq.NVFP4_DEFAULT_CFG
+    else:
+        raise RuntimeError("Unsupported quantization format")
     calibrate_loop = create_forward_loop(dataloader=calib_dataloader)
-    model = torch_tensorrt.dynamo.quantize(
-        model, args.quant_format, calibrate_loop, debug=args.debug
-    )
+
+    model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
+    if args.debug:
+        mtq.print_quant_summary(model)
 
     return model
 
@@ -83,12 +93,6 @@ def __init__(
         # Store reference to original linear layer for weight access
         self.original_linear = original_linear
 
-        # Copy bias from original layer if it exists
-        if original_linear.bias is not None:
-            self.bias = torch.nn.Parameter(original_linear.bias.clone()).cuda()
-        else:
-            self.bias = None
-
         # Create quantizers for input and weight tensors
         self.input_quantizer = TensorQuantizer(
             quant_attribute_cfg=quant_cfg, amax=input_amax
@@ -100,7 +104,7 @@ def __init__(
     def forward(self, input):
         input = self.input_quantizer(input)
         weight = self.weight_quantizer(self.original_linear.weight)
-        return torch.nn.functional.linear(input, weight, self.bias)
+        return torch.nn.functional.linear(input, weight, self.original_linear.bias)
 
 
 def load_quantization_config(model_name):
@@ -134,7 +138,7 @@ def load_quantization_config(model_name):
     return hf_quant_config
 
 
-def convert_linear_to_tensorrt_quantized(model, hf_quant_config):
+def convert_linear_to_tensorrt_quantized(model, model_precision, hf_quant_config):
     """
     Convert linear layers in a model to TensorRT quantized versions from pre-quantized weights.
 
@@ -177,6 +181,13 @@ def convert_linear_to_tensorrt_quantized(model, hf_quant_config):
     if hf_quant_algo != "FP8" and hf_quant_algo != "NVFP4":
        raise RuntimeError("Only FP8 or NVFP4 quantization is supported")
 
+    if model_precision == "FP16":
+        weight_dtype = torch.float16
+    elif model_precision == "BF16":
+        weight_dtype = torch.bfloat16
+    else:
+        weight_dtype = torch.float32
+
     # Iterate through all modules in the model
     for name, module in model.named_modules():
         # Check if the module is a linear layer
@@ -195,14 +206,13 @@ def convert_linear_to_tensorrt_quantized(model, hf_quant_config):
                 continue
 
             if hf_quant_algo == "FP8":
-                # FP8 E4M3 format has a maximum representable value of 448.0
                 # Scale the quantization parameters accordingly
                 weight_scale = tensors.pop(weight_scale_name)
-                weight_amax = weight_scale * 448.0
-                input_amax = tensors.pop(input_scale_name) * 448.0
+                weight_amax = weight_scale * MAX_BOUND_FP8
+                input_amax = tensors.pop(input_scale_name) * MAX_BOUND_FP8
 
                 # Dequantize the weight using the scale factor
-                dequantized_weight_data = module.weight.to(torch.float32) * weight_scale
+                dequantized_weight_data = module.weight.to(weight_dtype) * weight_scale
 
                 # Configure quantizer for FP8 format (4 exponent bits, 3 mantissa bits)
                 quantizer_attribute_config = QuantizerAttributeConfig(
@@ -218,15 +228,15 @@ def convert_linear_to_tensorrt_quantized(model, hf_quant_config):
                 weight_scale2 = tensors.pop(weight_scale2_name)
 
                 # Calculate amax values with additional scaling factor for NVFP4
-                input_amax = input_scale * 448.0 * 6.0
-                weight_amax = weight_scale2 * 448.0 * 6.0
+                input_amax = input_scale * MAX_BOUND_FP8 * MAX_BOUND_NVFP4
+                weight_amax = weight_scale2 * MAX_BOUND_FP8 * MAX_BOUND_NVFP4
 
                 # Handle NVFP4 tensor format
                 weight_data = tensors.pop(weight_name)
                 original_shape = list(weight_data.shape)
                 original_shape[-1] *= 2  # NVFP4 packs 2 values per element
                 nvfp4_tensor = NVFP4QTensor(
-                    torch.Size(original_shape), torch.float32, weight_data
+                    torch.Size(original_shape), weight_dtype, weight_data
                 )
 
                 # Dequantize using both scales and block size configuration
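A note on the new `MAX_BOUND_FP8` constant used above: per-tensor FP8 checkpoints store a scale, and the amax expected by `TensorQuantizer` is recovered as `scale * 448.0`, 448 being the largest finite FP8 E4M3 value. Below is a self-contained sketch of that relationship in plain PyTorch; the tensor names and the round-trip demo are illustrative, not part of the commit.

```python
import torch

FP8_E4M3_MAX = 448.0  # same value as MAX_BOUND_FP8 in quantize_utils.py


def fp8_scale_to_amax(scale: torch.Tensor) -> torch.Tensor:
    # Checkpoints store scale = amax / 448.0, so the inverse recovers amax.
    return scale * FP8_E4M3_MAX


def dequantize_fp8_weight(
    weight_fp8: torch.Tensor, scale: torch.Tensor, dtype: torch.dtype = torch.float16
) -> torch.Tensor:
    # Pre-quantized weights are stored as float8_e4m3fn; upcast, then rescale.
    return weight_fp8.to(dtype) * scale


if __name__ == "__main__":
    w = torch.randn(128, 256)
    scale = w.abs().max() / FP8_E4M3_MAX           # per-tensor scale
    w_fp8 = (w / scale).to(torch.float8_e4m3fn)    # quantize to FP8
    w_deq = dequantize_fp8_weight(w_fp8, scale)    # dequantize back
    print(fp8_scale_to_amax(scale), (w - w_deq.float()).abs().max())
```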

tools/llm/run_llm.py

Lines changed: 17 additions & 9 deletions
@@ -71,12 +71,20 @@ def get_model(args):
 
     hf_quant_config = load_quantization_config(args.model)
     if hf_quant_config:
-        model = convert_linear_to_tensorrt_quantized(model, hf_quant_config).cuda()
-        print(f"Model converted to TensorRT quantized")
+        model = convert_linear_to_tensorrt_quantized(
+            model, args.model_precision, hf_quant_config
+        ).cuda()
+        print(
+            f"Model is {hf_quant_config['quant_algo']} pre-quantized hf model. Quantized linear layers are applied"
+        )
+        if args.quant_format:
+            raise RuntimeError(
+                f"Quantization cannot be applied for pre-quantized hf model"
+            )
 
-    if args.precision == "FP16":
+    if args.model_precision == "FP16":
         model = model.to(torch.float16)
-    elif args.precision == "BF16":
+    elif args.model_precision == "BF16":
         model = model.to(torch.bfloat16)
     else:
         model = model.to(torch.float32)
@@ -112,11 +120,11 @@ def compile_torchtrt(model, input_ids, args):
     # Set precision specific flags
     use_fp32_acc = False
     use_explicit_typing = False
-    if args.precision == "FP16":
+    if args.model_precision == "FP16":
         enabled_precisions = {torch.float32}
         use_fp32_acc = True
         use_explicit_typing = True
-    elif args.precision == "BF16":
+    elif args.model_precision == "BF16":
         enabled_precisions = {torch.bfloat16}
         use_fp32_acc = False
     else:
@@ -204,7 +212,7 @@ def measure_perf(trt_model, input_signature, backend_name):
         "--prompt", type=str, default="What is parallel programming ?", help="Prompt"
     )
     arg_parser.add_argument(
-        "--precision",
+        "--model_precision",
         type=str,
         default="FP16",
         help="Precision to use in the model. Options: FP16, BF16, FP32",
@@ -299,7 +307,7 @@ def measure_perf(trt_model, input_signature, backend_name):
     pyt_stats = record_stats(
         "PyTorch",
         pyt_timings,
-        args.precision,
+        args.model_precision,
         batch_size=args.batch_size,
         compile_time_s=None,
     )
@@ -357,7 +365,7 @@ def measure_perf(trt_model, input_signature, backend_name):
     trt_stats = record_stats(
         "TensorRT",
         trt_timings,
-        args.precision,
+        args.model_precision,
         batch_size=args.batch_size,
         compile_time_s=None,
     )
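As a summary of the precision handling changed above: `--model_precision` now selects both the eager-model dtype and the Torch-TensorRT compile flags. A condensed sketch of the mapping in `compile_torchtrt()` follows; the helper name is ours, and the FP32 branch body is an assumption since the hunk does not show it.

```python
import torch


def precision_to_flags(model_precision: str):
    """Mirror of the flag selection in compile_torchtrt() for the three precisions."""
    use_fp32_acc = False
    use_explicit_typing = False
    if model_precision == "FP16":
        # FP16 weights with FP32 accumulation and explicit typing enabled.
        enabled_precisions = {torch.float32}
        use_fp32_acc = True
        use_explicit_typing = True
    elif model_precision == "BF16":
        enabled_precisions = {torch.bfloat16}
        use_fp32_acc = False
    else:
        # FP32 branch is not shown in the hunk; assuming plain FP32 here.
        enabled_precisions = {torch.float32}
    return enabled_precisions, use_fp32_acc, use_explicit_typing


print(precision_to_flags("BF16"))
```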

tools/llm/utils.py

Lines changed: 1 addition & 1 deletion
@@ -247,7 +247,7 @@ def record_stats(backend, timings, precision, batch_size=1, compile_time_s=None)
 
     stats = {
         "Backend": backend,
-        "Precision": precision,
+        "Model Precision": precision,
         "Batch size": batch_size,
         "Median(FPS)": speed_med,
         "Mean(FPS)": speed_mean,
