|
52 | 52 | from QEfficient.utils.cache import to_hashable
|
53 | 53 | from QEfficient.utils.logging_utils import logger
|
54 | 54 |
|
55 |
| -MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 = ["MllamaForConditionalGeneration"] |
56 |
| - |
57 | 55 |
|
58 | 56 | class QEFFTransformersBase(QEFFBaseModel):
|
59 | 57 | """
|
@@ -627,17 +625,12 @@ def compile(
|
627 | 625 | ):
|
628 | 626 | self.export()
|
629 | 627 |
|
630 |
| - if mxfp6_matmul and self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6: |
631 |
| - logger.warning( |
632 |
| - "Due to accuracy issues of vision model fixing it's precision to fp16, while language model will be compiled for mxfp6" |
633 |
| - ) |
634 |
| - |
635 | 628 | self.vision_model._compile(
|
636 | 629 | compile_dir,
|
637 | 630 | compile_only=True,
|
638 | 631 | specializations=specializations["vision"],
|
639 | 632 | convert_to_fp16=True,
|
640 |
| - mxfp6_matmul=False, |
| 633 | + mxfp6_matmul=mxfp6_matmul, |
641 | 634 | mdp_ts_num_devices=num_devices,
|
642 | 635 | aic_num_cores=num_cores,
|
643 | 636 | custom_io=custom_io_vision,
|
@@ -946,11 +939,6 @@ def compile(
|
946 | 939 | if output_name.endswith("_RetainedState"):
|
947 | 940 | custom_io[output_name] = kv_cache_dtype
|
948 | 941 |
|
949 |
| - if self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and mxfp6_matmul: |
950 |
| - logger.warning( |
951 |
| - f"It is advised to use fp16 precision during compilation for {self.model.__class__.__name__} to avoid accuracy issues, got mxfp6_matmul=True" |
952 |
| - ) |
953 |
| - |
954 | 942 | self._compile(
|
955 | 943 | onnx_path,
|
956 | 944 | compile_dir,
|
@@ -1147,16 +1135,7 @@ class QEFFAutoModelForImageTextToText:
|
1147 | 1135 |
|
1148 | 1136 | _hf_auto_class = AutoModelForImageTextToText
|
1149 | 1137 |
|
1150 |
| - def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs): |
1151 |
| - if model.config.architectures[0] in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and not kv_offload: |
1152 |
| - # For models with mxfp6 accuracy issue, we will use kv_offload=True by default |
1153 |
| - if kv_offload is None: |
1154 |
| - kv_offload = True |
1155 |
| - else: |
1156 |
| - logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}") |
1157 |
| - elif kv_offload is None: |
1158 |
| - kv_offload = False |
1159 |
| - |
| 1138 | + def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs): |
1160 | 1139 | if kv_offload:
|
1161 | 1140 | return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs)
|
1162 | 1141 | else:
|
|
0 commit comments