Commit 21bb06b

fix gguf regression for large models (#680)
1 parent 4e1b81f commit 21bb06b

File tree: 3 files changed (+33, −31 lines)


auto_round/autoround.py

Lines changed: 27 additions & 20 deletions
@@ -876,6 +876,8 @@ def get_act_max_hook(module, input, output):
         pbar = tqdm(all_to_quantized_module_names)
         block_names_cnt = len(flatten_list(get_block_names(self.model,True)))
         clear_mem_freq = len(all_to_quantized_module_names)//block_names_cnt
+        if clear_mem_freq == 0:
+            clear_mem_freq = 1
         cnt = 1
         for name in pbar:
             pbar.set_description(f"Quantizing {name}")
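The two added lines are the core of the fix: `clear_mem_freq` presumably feeds a periodic `clear_memory()` call further down the loop (via a `cnt % clear_mem_freq` check that sits outside this hunk), and for large GGUF models with fewer remaining modules than named blocks the integer division floors to zero, so the first modulo would raise `ZeroDivisionError`. A minimal, self-contained sketch of that cadence; `rtn_quantize_all`, `quantize_one`, and `clear_memory` are hypothetical stand-ins, not auto_round's API:

    # Hypothetical sketch of the clear-memory cadence the guard protects.
    from tqdm import tqdm

    def rtn_quantize_all(module_names, block_names, quantize_one, clear_memory):
        block_names_cnt = max(len(block_names), 1)  # assume at least one block
        clear_mem_freq = len(module_names) // block_names_cnt
        if clear_mem_freq == 0:  # fewer modules than blocks -> avoid modulo by zero below
            clear_mem_freq = 1
        cnt = 1
        for name in tqdm(module_names):
            quantize_one(name)                # quantize one module via RTN
            if cnt % clear_mem_freq == 0:     # without the guard: ZeroDivisionError
                clear_memory()                # free accumulated GPU/CPU memory
            cnt += 1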
@@ -895,26 +897,27 @@ def get_act_max_hook(module, input, output):
                 model = model.to("cpu")
                 clear_memory()
                 self.quantize_via_rtn_blockwise(all_to_quantized_module_names)
-            except Exception:
-                # Final fallback: warn and use CPU-only quantization
-                logger.warning("Fallback to CPU. "
-                               "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`.")
-                model = model.to("cpu")
-                clear_memory()
-                if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1:
-                    import accelerate
-                    accelerate.hooks.remove_hook_from_submodules(model)
-
-                orig_device = self.device
-                self.device = "cpu"
-                self.quantize_via_rtn_blockwise(all_to_quantized_module_names)
-                self.device = orig_device
-            finally:
-                # Always remove hooks
-                for hook in hooks:
-                    hook.remove()
-        else:
-            raise
+            except RuntimeError as e:
+                if "CUDA out of memory" in str(e) or "MODULE:PT_DEVMEM" in str(e):
+                    # Final fallback: warn and use CPU-only quantization
+                    logger.warning("Fallback to CPU. "
+                                   "Consider enabling `low_gpu_mem_usage` or using more GPUs via `--device 0,1,2,3`.")
+                    model = model.to("cpu")
+                    clear_memory()
+                    if hasattr(model, "hf_device_map") and len(model.hf_device_map) > 1:
+                        import accelerate
+                        accelerate.hooks.remove_hook_from_submodules(model)
+
+                    orig_device = self.device
+                    self.device = "cpu"
+                    self.quantize_via_rtn_blockwise(all_to_quantized_module_names)
+                    self.device = orig_device
+                else:
+                    raise
+            finally:
+                # Always remove hooks
+                for hook in hooks:
+                    hook.remove()

         # Move back to CPU and free memory
         model.to("cpu")
@@ -1119,6 +1122,8 @@ def quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
         else:
             block_names_cnt = len(flatten_list(get_block_names(self.model, True)))
             clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt
+            if clear_mem_freq == 0:
+                clear_mem_freq = 1
             pbar = tqdm(all_to_quantized_module_names)
             cnt = 1
             for name in pbar:
@@ -1223,6 +1228,8 @@ def quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) -
         cnt = 1
         block_names_cnt = len(flatten_list(get_block_names(self.model, True)))
         clear_mem_freq = len(all_to_quantized_module_names) // block_names_cnt
+        if clear_mem_freq == 0:
+            clear_mem_freq = 1
         # Process remaining layers not in blocks
         for name in all_to_quantized_module_names:
             self.quantize_layer_via_rtn(name)

auto_round/export/export_to_gguf/export.py

Lines changed: 3 additions & 3 deletions
@@ -19,7 +19,8 @@
 from pathlib import Path
 import time
 from auto_round.export.export_to_gguf.convert import ModelBase, ModelType, get_model_architecture
-from auto_round.utils import logger, LazyImport, get_block_names, flatten_list, check_to_quantized, get_module
+from auto_round.utils import logger, LazyImport, get_block_names, flatten_list, check_to_quantized, get_module, \
+    clear_memory

 TMP_DIR_NAME = "tmp_dir"

@@ -159,8 +160,7 @@ def pack_gguf_layer(
             m.weight = None
             if hasattr(m, "bias"):
                 m.bias = None
-            import gc
-            gc.collect()
+            clear_memory()
         model.last_layer_name_to_block_name.pop(name)
         if len(model.last_layer_name_to_block_name) == 0:
             for gguf_model in gguf_model_instance_global:
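Switching from a bare `gc.collect()` to the project's `clear_memory()` keeps layer packing consistent with the rest of the codebase; a helper of this kind typically combines garbage collection with releasing the accelerator's cached allocations, roughly as in this sketch (an assumption about its behavior, not the actual `auto_round.utils.clear_memory`):

    # Sketch of what a clear_memory()-style helper commonly does; the real
    # auto_round.utils.clear_memory may differ (e.g. extra device backends).
    import gc
    import torch

    def clear_memory_sketch():
        gc.collect()                      # drop unreachable Python objects
        if torch.cuda.is_available():
            torch.cuda.empty_cache()      # return cached CUDA blocks to the driver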

auto_round/script/mllm.py

Lines changed: 3 additions & 8 deletions
@@ -23,8 +23,7 @@
     get_device_and_parallelism,
     set_cuda_visible_devices,
     logger,
-)
-
+)

 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

@@ -301,7 +300,6 @@ def tune(args):
     set_cuda_visible_devices(args.device)
     device_str, use_auto_mapping = get_device_and_parallelism(args.device)

-
     import torch
     if not args.disable_deterministic_algorithms:
         torch.use_deterministic_algorithms(True, warn_only=True)
@@ -449,8 +447,8 @@ def tune(args):
         model_kwargs=model_kwargs,
         data_type=args.data_type,
         disable_opt_rtn=args.disable_opt_rtn,
-        )
-
+    )
+
     model_name = args.model.rstrip("/")

     if model_name.split('/')[-1].strip('.') == "" and "gguf" not in args.format:
@@ -569,6 +567,3 @@ def lmms_eval(args):
         apply_chat_template=False,
     )
     return results
-
-
-
