NVIDIA
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
Lines changed: 1 addition & 0 deletions
diff --git a/examples/llm_eval/gen_model_answer.py b/examples/llm_eval/gen_model_answer.py
Lines changed: 34 additions & 1 deletion
diff --git a/examples/llm_eval/lm_eval_hf.py b/examples/llm_eval/lm_eval_hf.py
Lines changed: 23 additions & 2 deletions
diff --git a/examples/llm_eval/mmlu.py b/examples/llm_eval/mmlu.py
Lines changed: 5 additions & 1 deletion
diff --git a/examples/llm_eval/quantization_utils.py b/examples/llm_eval/quantization_utils.py
Lines changed: 21 additions & 12 deletions
diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py
Lines changed: 67 additions & 6 deletions
@@ -15,6 +15,7 @@ Model Optimizer Changelog (Linux)
 - Add ``specdec_bench`` example to benchmark speculative decoding performance. See `examples/specdec_bench/README.md <https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples/specdec_bench#speculative-decoding-benchmark>`_ for more details.
 - Add FP8/NVFP4 KV cache quantization support for Megatron Core models.
 - Add KL Divergence loss based auto_quantize method. See `auto_quantize API docs <https://nvidia.github.io/TensorRT-Model-Optimizer/reference/generated/modelopt.torch.quantization.model_quant.html#modelopt.torch.quantization.model_quant.auto_quantize>`_ for more details.
+- Add support for saving and resuming the auto_quantize search state. When a previously saved search state is provided, the sensitivity score estimation step is skipped, which speeds up the auto_quantize process.
 - Add flag ``trt_plugins_precision`` in ONNX autocast to indicate custom ops precision. This is similar to the flag already existing in the quantization workflow.
 - Add support for PyTorch Geometric quantization.
 - Add per tensor and per channel MSE calibrator support.
 
@@ -201,8 +201,11 @@ def get_model_answers(
                 tokenizer,
                 args.calib_batch_size,
                 args.calib_size,
-                args.auto_quantize_bits,
                 test_generated=False,
+                auto_quantize_bits=args.auto_quantize_bits,
+                auto_quantize_method=args.auto_quantize_method,
+                auto_quantize_score_size=args.auto_quantize_score_size,
+                auto_quantize_checkpoint=args.auto_quantize_checkpoint,
             )
 
     for question in tqdm(questions):
@@ -450,6 +453,36 @@ def reorg_answer_file(answer_file):
             "regular quantization without auto_quantize search will be applied."
         ),
     )
+    parser.add_argument(
+        "--auto_quantize_method",
+        type=str,
+        default="gradient",
+        choices=["gradient", "kl_div"],
+        help=(
+            "Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
+            "(requires labels in dataset). 'kl_div' uses KL divergence between original and "
+            "quantized model outputs (no labels required). Default: 'gradient'"
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_score_size",
+        type=int,
+        default=128,
+        help=(
+            "Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
+            "sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
+            "final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Path to checkpoint file for saving/restoring auto_quantize search state "
+            "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
+        ),
+    )
     parser.add_argument(
         "--trust_remote_code",
         help="Set trust_remote_code for Huggingface models and tokenizers",
 
@@ -54,6 +54,8 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
     quant_cfg = arg_dict.pop("quant_cfg", None)
     auto_quantize_bits = arg_dict.pop("auto_quantize_bits", None)
     auto_quantize_method = arg_dict.pop("auto_quantize_method", "gradient")
+    auto_quantize_score_size = arg_dict.pop("auto_quantize_score_size", 128)
+    auto_quantize_checkpoint = arg_dict.pop("auto_quantize_checkpoint", None)
     calib_batch_size = arg_dict.pop("calib_batch_size", None)
     calib_size = arg_dict.pop("calib_size", 512)
     compress = arg_dict.pop("compress", False)
@@ -83,8 +85,10 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
             calib_size=calib_size,
             auto_quantize_bits=auto_quantize_bits,
             auto_quantize_method=auto_quantize_method,
+            auto_quantize_score_size=auto_quantize_score_size,
             test_generated=False,
             compress=compress,
+            auto_quantize_checkpoint=auto_quantize_checkpoint,
         )
 
     return model_obj
@@ -103,6 +107,12 @@ def setup_parser_with_modelopt_args():
             "comma-separated list of quantization quantization formats that will be searched by `auto_quantize`"
         ),
     )
+    parser.add_argument(
+        "--calib_batch_size", type=int, help="Batch size for quantization calibration"
+    )
+    parser.add_argument(
+        "--calib_size", type=int, help="Calibration size for quantization", default=512
+    )
     parser.add_argument(
         "--auto_quantize_bits",
         type=float,
@@ -123,10 +133,19 @@ def setup_parser_with_modelopt_args():
         ),
     )
     parser.add_argument(
-        "--calib_batch_size", type=int, help="Batch size for quantization calibration"
+        "--auto_quantize_score_size",
+        type=int,
+        default=128,
+        help=(
+            "Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
+            "sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
+            "final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
+        ),
     )
     parser.add_argument(
-        "--calib_size", type=int, help="Calibration size for quantization", default=512
+        "--auto_quantize_checkpoint",
+        type=str,
+        help=("Path to checkpoint file for saving/restoring auto_quantize search state. "),
     )
     parser.add_argument(
         "--compress",
@@ -153,6 +172,8 @@ def setup_parser_with_modelopt_args():
             "quant_cfg": args.quant_cfg,
             "auto_quantize_bits": args.auto_quantize_bits,
             "auto_quantize_method": args.auto_quantize_method,
+            "auto_quantize_score_size": args.auto_quantize_score_size,
+            "auto_quantize_checkpoint": args.auto_quantize_checkpoint,
             "calib_batch_size": args.calib_batch_size,
             "calib_size": args.calib_size,
             "compress": args.compress,
 
@@ -224,10 +224,12 @@ def main(
     ntrain: int = 5,
     quant_cfg: str | None = None,
     auto_quantize_bits: float | None = None,
-    auto_quantize_method: str = "gradient",
     batch_size: int = 0,
     calib_size: int = 512,
     dtype: str = "bfloat16",
+    auto_quantize_method: str = "gradient",
+    auto_quantize_score_size: int = 128,
+    auto_quantize_checkpoint: str | None = None,
     **kwargs,
 ):
     random.seed(RAND_SEED)
@@ -283,6 +285,8 @@ def main(
                     calib_size=calib_size,
                     auto_quantize_bits=auto_quantize_bits,
                     auto_quantize_method=auto_quantize_method,
+                    auto_quantize_score_size=auto_quantize_score_size,
+                    auto_quantize_checkpoint=auto_quantize_checkpoint,
                 )
 
     for subject in tqdm(subjects):
 
@@ -67,8 +67,10 @@ def _quantize_model_with_dataset(
     calib_dataset,
     auto_quantize_bits=None,
     auto_quantize_method="gradient",
+    auto_quantize_score_size=128,
     batch_size=1,
     compress=False,
+    auto_quantize_checkpoint=None,
 ):
     if hasattr(lm, "gpt2"):
         net = lm.gpt2
@@ -112,11 +114,12 @@ def forward_step(model, batch):
             forward_step=forward_step,
             loss_func=loss_func,
             num_calib_steps=len(calib_dataset),
-            num_score_steps=min(
-                len(calib_dataset), 128 // batch_size
-            ),  # Limit the number of score steps to avoid long calibration time
+            # Most time is spent on score estimation; fewer samples speed it up with little accuracy impact.
+            num_score_steps=min(len(calib_dataset), max(auto_quantize_score_size // batch_size, 1)),
             verbose=True,
             method=auto_quantize_method,
+            # disabled_layers=["*lm_head*", "*mlp.gate.*"],
+            checkpoint=auto_quantize_checkpoint,
         )
     else:
         mtq_cfg = CUSTOM_CONFIG.get(quant_cfg)  # type: ignore [arg-type]
@@ -160,11 +163,13 @@ def quantize_model(
     tokenizer,
     batch_size,
     calib_size,
-    auto_quantize_bits=None,
-    auto_quantize_method="gradient",
     data="cnn_dailymail",
     test_generated=True,
     compress=False,
+    auto_quantize_bits=None,
+    auto_quantize_method="gradient",
+    auto_quantize_score_size=128,
+    auto_quantize_checkpoint=None,
 ):
     """Quantizes the model with the provided calibration dataset.
 
@@ -175,11 +180,14 @@ def quantize_model(
         tokenizer: the tokenizer.
         batch_size: the calibration batch size for each calibration inference run.
         calib_size: the total calibration dataset size.
-        auto_quantize_bits: The effective bits constraint for auto_quantize.
-        auto_quantize_method: The method for auto_quantize ('gradient' or 'kl_div').
         data: the name of the calibration dataset.
         test_generated:  If ``True``, test the generated text before and after quantization.
         compress: If ``True``, compress the model after quantization.
+        auto_quantize_bits: The effective bits constraint for auto_quantize.
+        auto_quantize_method: The method for auto_quantize ('gradient' or 'kl_div').
+        auto_quantize_score_size: Number of samples used for auto_quantize scoring.
+        auto_quantize_checkpoint: Path to checkpoint file for saving/restoring auto_quantize search state
+            (sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified.
     """
     if "AWQ" in quant_cfg:
         print(
@@ -191,8 +199,10 @@ def quantize_model(
     if hasattr(model, "model"):
         device = model.model.device
 
+    is_gradient_based = auto_quantize_bits is not None and auto_quantize_method == "gradient"
+
     if batch_size == 0:
-        if auto_quantize_bits is not None or torch.distributed.is_initialized():
+        if is_gradient_based or torch.distributed.is_initialized():
             raise ValueError("We dont support automatic batch size inference for this case.")
 
         net = model.gpt2 if hasattr(model, "gpt2") else model.model
@@ -201,16 +211,13 @@ def quantize_model(
         batch_size = get_max_batch_size(net)
         print(f"Update calib batch {batch_size}")
 
-    # Labels are only needed for gradient-based auto_quantize
-    include_labels = auto_quantize_bits is not None and auto_quantize_method == "gradient"
-
     calib_dataloader = get_dataset_dataloader(
         dataset_name=data,
         tokenizer=tokenizer,
         batch_size=batch_size,
         num_samples=calib_size,
         device=device,
-        include_labels=include_labels,
+        include_labels=is_gradient_based,
     )
 
     if test_generated:
@@ -223,8 +230,10 @@ def quantize_model(
         calib_dataloader,
         auto_quantize_bits,
         auto_quantize_method,
+        auto_quantize_score_size,
         batch_size,
         compress,
+        auto_quantize_checkpoint,
     )
 
     if test_generated:
 
@@ -95,7 +95,15 @@
 
 
 def auto_quantize(
-    model, qformat, auto_quantize_bits, calib_dataloader, calibrate_loop, batch_size=1
+    model,
+    qformat,
+    calib_dataloader,
+    calibrate_loop,
+    auto_quantize_bits,
+    batch_size=1,
+    auto_quantize_method="gradient",
+    auto_quantize_score_size=128,
+    auto_quantize_checkpoint=None,
 ):
     qformat_list = qformat.split(",")
     assert qformat_list, "No quantization formats provided"
@@ -122,18 +130,34 @@ def loss_func(output, data):
         # which contains the loss attribute.
         return output.loss
 
+    if auto_quantize_method == "gradient":
+        # For gradient-based method, return full output with loss
+        def forward_step(model, batch):
+            return model(**batch)
+    elif auto_quantize_method == "kl_div":
+        # For KL divergence method, return only logits
+        def forward_step(model, batch):
+            return model(**batch).logits
+    else:
+        raise ValueError(
+            f"Invalid auto_quantize_method: {auto_quantize_method}. Must be 'gradient' or 'kl_div'"
+        )
+
     model, _ = mtq.auto_quantize(
         model,
         constraints={"effective_bits": auto_quantize_bits},
         data_loader=calib_dataloader,
-        forward_step=lambda model, batch: model(**batch),
-        loss_func=loss_func,
+        forward_step=forward_step,
+        loss_func=loss_func,  # Only used for gradient-based method
         # TRTLLM only support one quantization format or None (do not quantize, internally supported)
         quantization_formats=[QUANT_CFG_CHOICES[format] for format in qformat_list],
         num_calib_steps=len(calib_dataloader),
-        num_score_steps=len(calib_dataloader),
+        # AutoQuantize scoring is the costly phase; allow smaller sample counts than calibration.
+        num_score_steps=min(len(calib_dataloader), max(auto_quantize_score_size // batch_size, 1)),
         verbose=True,
         disabled_layers=["*lm_head*"],
+        method=auto_quantize_method,
+        checkpoint=auto_quantize_checkpoint,
     )
 
     # We need to explicitly calibrate for kv cache quantization
@@ -191,10 +215,13 @@ def quantize_model(model, quant_cfg, args, calib_dataloader=None, calibration_on
         model = auto_quantize(
             model,
             args.qformat,
-            args.auto_quantize_bits,
             calib_dataloader,
             calibrate_loop,
+            args.auto_quantize_bits,
             args.batch_size,
+            args.auto_quantize_method,
+            args.auto_quantize_score_size,
+            args.auto_quantize_checkpoint,
         )
     elif calibration_only:
         model = mtq.calibrate(model, quant_cfg["algorithm"], forward_loop=calibrate_loop)
@@ -444,13 +471,17 @@ def main(args):
             assert tokenizer is not None and isinstance(
                 tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)
             ), "The PreTrainedTokenizer must be set"
+            # Labels are only needed for gradient-based auto_quantize
+            include_labels = (
+                args.auto_quantize_bits is not None and args.auto_quantize_method == "gradient"
+            )
             calib_dataloader = get_dataset_dataloader(
                 dataset_name=args.dataset,
                 tokenizer=tokenizer,
                 batch_size=args.batch_size,
                 num_samples=args.calib_size,
                 device=device,
-                include_labels=args.auto_quantize_bits is not None,
+                include_labels=include_labels,
             )
 
         quant_cfg = build_quant_cfg(
@@ -803,6 +834,36 @@ def output_decode(generated_ids, input_shape):
         default=None,
         type=str,
     )
+    parser.add_argument(
+        "--auto_quantize_method",
+        type=str,
+        default="gradient",
+        choices=["gradient", "kl_div"],
+        help=(
+            "Method for auto_quantize sensitivity analysis. 'gradient' uses gradient-based method "
+            "(requires labels in dataset). 'kl_div' uses KL divergence between original and "
+            "quantized model outputs (no labels required). Default: 'gradient'"
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_score_size",
+        type=int,
+        default=128,
+        help=(
+            "Number of samples to use for auto_quantize scoring. Most of auto_quantize time is spent on "
+            "sensitivity score estimation, so reducing this speeds it up while only minimally affecting "
+            "final model accuracy compared to lowering --calib_size (the number of samples used for calibration)."
+        ),
+    )
+    parser.add_argument(
+        "--auto_quantize_checkpoint",
+        type=str,
+        default=None,
+        help=(
+            "Path to checkpoint file for saving/restoring auto_quantize search state "
+            "(sensitivity scores, costs, etc.). Only used when auto_quantize_bits is specified."
+        ),
+    )
 
     args = parser.parse_args()