NVIDIA · jingyu-ml · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025 · Dec 5, 2025
@@ -131,9 +131,21 @@ Please reference our [framework scripts](#framework-scripts) and our [docs](http
 | QwQ | ✅ | - | - | - | ✅ |
 | T5 | ✅ | ✅ | ✅ | ✅ | - |
 | Whisper | ✅ | ❌ | ❌ | ❌ | - |
+| Kimi-K2-Thinking-BF16 | ✅ | ❌ | ❌ | ❌ | ✅ |
 
 > *This is a subset of the models supported. For the full list please check the [TensorRT-LLM support matrix](https://nvidia.github.io/TensorRT-LLM/reference/precision.html#support-matrix)*
 
+> We recommend upcasting the Kimi-K2-Thinking checkpoint from INT4 to BF16 before running quantization, for example:
+
+```python
+from transformers import AutoModelForCausalLM
+from transformers.utils.quantization_config import CompressedTensorsConfig
+
+model = AutoModelForCausalLM.from_pretrained("moonshotai/Kimi-K2-Thinking", torch_dtype="auto", device_map = "auto", local_files_only = True, trust_remote_code = True, quantization_config = CompressedTensorsConfig(run_compressed=False))
+
+# Then save the upcast BF16 model with model.save_pretrained(<output_dir>)
+```
+
 > *<sup>1.</sup>The w4a8_awq is an experimental quantization scheme that may result in a higher accuracy penalty.* \
 > *<sup>2.</sup>For some models, there is only support for exporting quantized checkpoints.* \
 > *<sup>3.</sup>W4A8_AWQ is only available on some models but not all* \

@@ -80,6 +80,7 @@
     "w4a8_nvfp4_fp8": mtq.W4A8_NVFP4_FP8_CFG,
     "w4a8_mxfp4_fp8": mtq.W4A8_MXFP4_FP8_CFG,
     "nvfp4_mlp_only": mtq.NVFP4_MLP_ONLY_CFG,
+    "nvfp4_experts_only": mtq.NVFP4_EXPERTS_ONLY_CFG,
 }
 
 KV_QUANT_CFG_CHOICES = {
@@ -121,6 +122,7 @@ def auto_quantize(
             "fp8_pb_wo",
             "w4a8_mxfp4_fp8",
             "nvfp4_mlp_only",
+            "nvfp4_experts_only",
         ]
         for qformat in qformat_list
     ), "One or more quantization formats provided are not supported for unified checkpoint export"

@@ -623,6 +623,25 @@
     "algorithm": "max",
 }
 
+NVFP4_EXPERTS_ONLY_CFG = {  # enables quantization only for quantizers whose name matches "*mlp.experts*"
+    "quant_cfg": {
+        "*mlp.experts*weight_quantizer": {  # weight quantizers under mlp.experts
+            "num_bits": (2, 1),
+            "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},  # blocks of 16 along the last axis
+            "enable": True,
+            "pass_through_bwd": True,
+        },
+        "*mlp.experts*input_quantizer": {  # input quantizers under mlp.experts; same settings as the weights
+            "num_bits": (2, 1),
+            "block_sizes": {-1: 16, "type": "dynamic", "scale_bits": (4, 3)},
+            "enable": True,
+            "pass_through_bwd": True,
+        },
+        **_default_disabled_quantizer_cfg,  # shared entries defined outside this hunk; presumably disable the rest - confirm
+    },
+    "algorithm": "max",
+}
+
 choices: set[str] = {
     "FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG",
     "FP8_AFFINE_KV_CFG",
@@ -652,6 +671,7 @@
     "NVFP4_MLP_WEIGHT_ONLY_CFG",
     "MXFP4_MLP_WEIGHT_ONLY_CFG",
     "NVFP4_MLP_ONLY_CFG",
+    "NVFP4_EXPERTS_ONLY_CFG",
 }
 
 BiasType = Literal["static", "dynamic"]