Removed warning and override of mxfp6 for internal use (#277)

quic-amitraj · web-flow · commit f8b5db6add0c · 2025-02-28T21:57:37.000+05:30
Fix compilation and enable mxfp6 for the vision encoder

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
@@ -52,8 +52,6 @@
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
 
-MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 = ["MllamaForConditionalGeneration"]
-
 
 class QEFFTransformersBase(QEFFBaseModel):
     """
@@ -627,17 +625,12 @@ def compile(
         ):
             self.export()
 
-        if mxfp6_matmul and self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6:
-            logger.warning(
-                "Due to accuracy issues of vision model fixing it's precision to fp16, while language model will be compiled for mxfp6"
-            )
-
         self.vision_model._compile(
             compile_dir,
             compile_only=True,
             specializations=specializations["vision"],
             convert_to_fp16=True,
-            mxfp6_matmul=False,
+            mxfp6_matmul=mxfp6_matmul,
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             custom_io=custom_io_vision,
@@ -946,11 +939,6 @@ def compile(
             if output_name.endswith("_RetainedState"):
                 custom_io[output_name] = kv_cache_dtype
 
-        if self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and mxfp6_matmul:
-            logger.warning(
-                f"It is advised to use fp16 precision during compilation for {self.model.__class__.__name__} to avoid accuracy issues, got mxfp6_matmul=True"
-            )
-
         self._compile(
             onnx_path,
             compile_dir,
@@ -1147,16 +1135,7 @@ class QEFFAutoModelForImageTextToText:
 
     _hf_auto_class = AutoModelForImageTextToText
 
-    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs):
-        if model.config.architectures[0] in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and not kv_offload:
-            # For models with mxfp6 accuracy issue, we will use kv_offload=True by default
-            if kv_offload is None:
-                kv_offload = True
-            else:
-                logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}")
-        elif kv_offload is None:
-            kv_offload = False
-
+    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs):
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs)
         else: