Minor update: condense FP8_PC_PT MoE scaling comments and drop the unexpected scale-dimension ValueError branch

Edwardf0t1 · Edwardf0t1 · commit 8d7fe0b7914e · 2025-11-06T00:30:19.000-08:00
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
diff --git a/modelopt/torch/export/quant_utils.py b/modelopt/torch/export/quant_utils.py
@@ -776,36 +776,23 @@ def to_quantized_weight(
 
     if quantization == QUANTIZATION_FP8_PC_PT:
         if weight.dim() == 3:
-            # for MOE stacked weights
-            # For standard MoE: weight (num_experts, output_dim, input_dim)
-            #                   scale (num_experts, output_dim)
-            # For BMM-style transposed experts: weight (num_experts, output_dim, input_dim)
-            #                                    scale (num_experts, input_dim)
-
             # Handle different scale tensor shapes
             if weights_scaling_factor.dim() == 1:
                 # Per-expert scaling only: (num_experts,) -> (num_experts, 1, 1)
                 return (weight / weights_scaling_factor[:, None, None]).to(torch.float8_e4m3fn)
             elif weights_scaling_factor.dim() == 2:
                 # Per-channel scaling: check which dimension matches
                 if weights_scaling_factor.shape[-1] == weight.shape[-1]:
-                    # Scale matches last dim (input_dim) - BMM-style transposed case
-                    # (num_experts, input_dim) -> (num_experts, 1, input_dim)
+                    # (num_experts, input_dim) -> (num_experts, 1, input_dim), BMM-style
                     return (weight / weights_scaling_factor.unsqueeze(-2)).to(torch.float8_e4m3fn)
                 elif weights_scaling_factor.shape[-1] == weight.shape[-2]:
-                    # Scale matches second-to-last dim (output_dim) - standard MoE case
-                    # (num_experts, output_dim) -> (num_experts, output_dim, 1)
+                    # (num_experts, output_dim) -> (num_experts, output_dim, 1), Standard MoE case
                     return (weight / weights_scaling_factor.unsqueeze(-1)).to(torch.float8_e4m3fn)
                 else:
-                    # Shape mismatch - try to infer correct broadcasting
                     raise ValueError(
                         f"Cannot determine correct unsqueeze dimension for FP8_PC_PT quantization. "
                         f"weight shape: {weight.shape}, scale shape: {weights_scaling_factor.shape}"
                     )
-            else:
-                raise ValueError(
-                    f"Unexpected scaling factor dimension for 3D weight: {weights_scaling_factor.dim()}"
-                )
         return (weight / weights_scaling_factor[:, None]).to(torch.float8_e4m3fn)
 
     if quantization in [QUANTIZATION_INT4_AWQ, QUANTIZATION_W4A8_AWQ]:
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
@@ -355,8 +355,6 @@ def _export_quantized_weight(
         )
     elif quantization_format == QUANTIZATION_FP8_PC_PT and is_bmm_expert_weight:
         # For FP8_PC_PT with BMM-style experts, transpose only the weight (not weight_scale)
-        # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)
-        # weight_scale remains (num_experts, output_dim) for per-channel quantization
         weight, _ = maybe_transpose_expert_weight_dimensions(
             weight, is_bmm_expert_weight=is_bmm_expert_weight
         )

Original file line number	Diff line number	Diff line change
`@@ -355,8 +355,6 @@ def _export_quantized_weight(`
`355`	`355`	`)`
`356`	`356`	`elif quantization_format == QUANTIZATION_FP8_PC_PT and is_bmm_expert_weight:`
`357`	`357`	`# For FP8_PC_PT with BMM-style experts, transpose only the weight (not weight_scale)`
`358`		`- # Transpose weight from (num_experts, input_dim, output_dim) to (num_experts, output_dim, input_dim)`
`359`		`- # weight_scale remains (num_experts, output_dim) for per-channel quantization`
`360`	`358`	`weight, _ = maybe_transpose_expert_weight_dimensions(`
`361`	`359`	`weight, is_bmm_expert_weight=is_bmm_expert_weight`
`362`	`360`	`)`