Enable MiniMax-M3 MXFP4 (AttnFP8) on top of the BF16 M3 support

thpereir · thpereir · commit 9d0b3282018d · 2026-06-24T12:07:35.000-04:00
The in-tree MiniMax-M3 model already covers the BF16 checkpoint. This
adds the small pieces the quantized amd/MiniMax-M3-MXFP4-AttnFP8 build
needs, without disturbing the BF16 path.

- config.py: register the minimax_m3_vl multimodal wrapper and parse its
  text sub-config (which declares no model_type) with the base
  PretrainedConfig so every field is retained and no deepseek/MLA
  defaults leak in; stamp model_type=minimax_m3 from the top-level type.
  The quark quantization_config (already propagated from the root) and
  the original architectures are preserved, so loading resolves to the
  existing MiniMaxM3Sparse model. The BF16 checkpoint keeps its direct
  minimax_m3 model_type and is unaffected.
- linear.py: pad the MXFP4 Linear contraction dim to a multiple of 256.
  The a4w4 asm GEMM reads K in 256-wide tiles, so an unaligned K (M3's
  shared-expert down_proj at TP=8, K=384) faults on GPU.
  LinearBase._pad_mxfp4_input_dim() zero-pads the fp4x2 weight and its
  e8m0 scale up to the next multiple of 256 (no-op when already aligned),
  and forward() zero-pads the unquantized activation by the same amount.
diff --git a/atom/config.py b/atom/config.py
@@ -586,6 +586,7 @@ def _remap_layer_name(name: str) -> list[str]:
     "qwen3_5": "text_config",
     "qwen3_5_moe": "text_config",
     "mistral3": "text_config",
+    "minimax_m3_vl": "text_config",
 }
 
 # multimodal models fully supported by plugin mode
@@ -630,10 +631,20 @@ def _get_hf_token() -> str | None:
             and "quantization_config" in config_dict
         ):
             text_config_dict["quantization_config"] = config_dict["quantization_config"]
-        text_model_type = text_config_dict.get("model_type", "deepseek_v3")
-        mapped_type = _CONFIG_REGISTRY.get(text_model_type, text_model_type)
-        config_class = AutoConfig.for_model(mapped_type)
-        hf_config = config_class.from_dict(text_config_dict)
+        if "model_type" not in text_config_dict:
+            # The text sub-config declares no `model_type` of its own (e.g.
+            # MiniMax-M3's minimax_m3_vl wrapper). Parse it with the base
+            # config so every field is retained and no foreign (e.g. deepseek
+            # MLA) defaults are injected; the model class reads its own fields
+            # via getattr. Stamp the model_type from the top-level type so
+            # downstream policy can key off it.
+            hf_config = PretrainedConfig(**text_config_dict)
+            hf_config.model_type = model_type.removesuffix("_vl")
+        else:
+            text_model_type = text_config_dict.get("model_type", "deepseek_v3")
+            mapped_type = _CONFIG_REGISTRY.get(text_model_type, text_model_type)
+            config_class = AutoConfig.for_model(mapped_type)
+            hf_config = config_class.from_dict(text_config_dict)
         # Override architectures so that ATOM selects the correct model class
         # which can handle the multimodal weight prefix during loading.
         original_arch = config_dict.get("architectures", [])
diff --git a/atom/model_ops/linear.py b/atom/model_ops/linear.py
@@ -542,6 +542,46 @@ def online_quantize_weight(self):
             "quant_dtype": str(online_quant_dtype),
         }
 
+    def _pad_mxfp4_input_dim(self):
+        """Zero-pad the MXFP4 (per-1x32 fp4x2) contraction dim to a multiple of 256.
+
+        The a4w4 asm GEMM's preshuffle + e8m0 scale layout reads the K
+        dimension in 256-wide tiles. Per-rank shapes whose K is not a
+        multiple of 256 (e.g. the TP=8 shared-expert down_proj with
+        K=384) otherwise trigger an out-of-bounds GPU memory access
+        fault. Padded weight bytes are zero so the extra K contributes
+        nothing to the result. Mirrors FusedMoE's pad_align=256.
+        Repeat calls on already-padded tensors keep the recorded pad.
+        """
+        # Keep a pad recorded by an earlier call: resetting it to 0 here would
+        # make forward() skip padding x against the still-padded weight.
+        self._mxfp4_in_pad = getattr(self, "_mxfp4_in_pad", 0)
+        if not (
+            self.quant_type == QuantType.per_1x32
+            and self.params_dtype == dtypes.fp4x2
+            and self.weight.dim() == 2
+            and self.weight.data.dtype == dtypes.fp4x2
+        ):
+            return
+        weight_scale = getattr(self, "weight_scale", None)
+        if weight_scale is None:
+            return
+        align = 256
+        k = weight_scale.shape[-1] * MXFP4_QUANT_BLOCK_SIZE
+        k_pad = (k + align - 1) // align * align
+        if k_pad == k:
+            return
+        scale_pad = k_pad // MXFP4_QUANT_BLOCK_SIZE - weight_scale.shape[-1]
+        pad = torch.nn.functional.pad
+        # weight_scale is e8m0 (exponent-only); 0.0 is not representable, so pad
+        # the raw bytes (0x00 -> 2^-127, harmless since padded weights are zero).
+        scale_u8 = pad(weight_scale.data.view(torch.uint8), (0, scale_pad))
+        self.weight_scale.data = scale_u8.view(weight_scale.data.dtype)
+        weight_u8 = self.weight.data.view(torch.uint8)
+        weight_pad = k_pad // 2 - weight_u8.shape[-1]
+        self.weight.data = pad(weight_u8, (0, weight_pad)).view(dtypes.fp4x2)
+        self._mxfp4_in_pad = k_pad - k
+
     def process_weights_after_loading(self):
         # Re-quantize before process_weights if online quantization is enabled
         if self.quant_config is not None and self.quant_config.online_quant:
@@ -580,6 +620,7 @@ def process_weights_after_loading(self):
             )
             self.weight.data = w_q
             self.weight_scale = atom_parameter(w_s)
+            self._pad_mxfp4_input_dim()
             # Only quantized 2D GEMM weights use aiter's preshuffle layout.
             # Qwen3-Next/Qwen3.5 GDN conv1d expands its weight to 3D, so FP8/blocked
             # quantized models must keep that tensor unshuffled here.
@@ -591,6 +632,8 @@ def process_weights_after_loading(self):
                 self.quant_type == QuantType.per_1x32
                 and self.params_dtype == dtypes.fp4x2
             )
+            if is_fp4_blockscale:
+                self._pad_mxfp4_input_dim()
             need_shuffle = (
                 self.quant_type == QuantType.per_Token
                 and self.params_dtype == dtypes.fp8
@@ -688,6 +731,9 @@ def forward(
                 if self.bias is not None:
                     y += self.bias
             elif self.quant_type.value == QuantType.per_1x32.value:
+                in_pad = getattr(self, "_mxfp4_in_pad", 0)
+                if in_pad and x_scale is None:
+                    x = torch.nn.functional.pad(x, (0, in_pad))
                 y = gemm_a4w4_quant(
                     x,
                     x_scale,