From 202af786d2fe7e6b53e83eccc58b64485d9e1000 Mon Sep 17 00:00:00 2001 From: JessicaJiang-123 Date: Sun, 14 Jun 2026 04:08:45 +0000 Subject: [PATCH 1/5] Disable gradient-accumulation-fusion for ROCm blockwise FP8 Qwen3-30B-A3B (first-cut) --- examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh b/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh index e761a3acdf..87c872de35 100644 --- a/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh +++ b/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh @@ -88,6 +88,10 @@ PERF_ARGS=( --bf16 --fp8-format e4m3 --fp8-recipe blockwise + # ROCm/gfx950 blockwise grouped FP8 first-cut does not yet implement wgrad fusion. + # Disable it so the MoE GroupedLinear returns a plain wgrad and Megatron's DDP + # post-hook accumulates it into the fp32 main_grad (numerically equivalent for bring-up). + --no-gradient-accumulation-fusion # --fp8-param-gather ) From 8bb4d135d341ccbbb5009c5b0f75af325c906ce3 Mon Sep 17 00:00:00 2001 From: Zhiyao Jiang Date: Tue, 16 Jun 2026 05:13:21 +0000 Subject: [PATCH 2/5] Enable restore_weights_before_loading for fp8 quant in RL weight update --- .../megatron_utils/update_weight/update_weight_from_tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/miles/backends/megatron_utils/update_weight/update_weight_from_tensor.py b/miles/backends/megatron_utils/update_weight/update_weight_from_tensor.py index b017b796cb..ef30e09c7b 100644 --- a/miles/backends/megatron_utils/update_weight/update_weight_from_tensor.py +++ b/miles/backends/megatron_utils/update_weight/update_weight_from_tensor.py @@ -195,7 +195,7 @@ def update_weights(self) -> None: if ( not skip_base_sync and self.quantization_config - and self.quantization_config["quant_method"] in ["compressed-tensors"] + and self.quantization_config["quant_method"] in ["compressed-tensors", "fp8"] ): post_process_weights( rollout_engines=self.rollout_engines, From bbedc7f755b5440ab6970d72a2df5ab965a1b055 Mon Sep 17 00:00:00 2001 From: Zhiyao Jiang Date: Fri, 19 Jun 2026 20:17:19 +0000 Subject: [PATCH 3/5] Add AMD MI350X (gfx950) fp8 blockwise path to run_qwen3_30b_a3b.py; revert example .sh ROCm edit --- .../run-qwen3-30b-a3b-fp8-two-nodes.sh | 4 -- scripts/run_qwen3_30b_a3b.py | 47 ++++++++++++++++++- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh b/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh index 87c872de35..e761a3acdf 100644 --- a/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh +++ b/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh @@ -88,10 +88,6 @@ PERF_ARGS=( --bf16 --fp8-format e4m3 --fp8-recipe blockwise - # ROCm/gfx950 blockwise grouped FP8 first-cut does not yet implement wgrad fusion. - # Disable it so the MoE GroupedLinear returns a plain wgrad and Megatron's DDP - # post-hook accumulates it into the fp32 main_grad (numerically equivalent for bring-up). - --no-gradient-accumulation-fusion # --fp8-param-gather ) diff --git a/scripts/run_qwen3_30b_a3b.py b/scripts/run_qwen3_30b_a3b.py index cb9e225f7c..711530c0de 100644 --- a/scripts/run_qwen3_30b_a3b.py +++ b/scripts/run_qwen3_30b_a3b.py @@ -1,3 +1,4 @@ +import os from dataclasses import dataclass from typing import Literal @@ -13,7 +14,7 @@ class ScriptArgs(U.ExecuteTrainConfig): model_name: str = "Qwen3-30B-A3B" megatron_model_type: str = "qwen3-30B-A3B" num_gpus_per_node: int | None = None - hardware: Literal["H100", "B200", "B300", "GB200", "GB300"] = "H100" + hardware: Literal["H100", "B200", "B300", "GB200", "GB300", "MI300X", "MI350X"] = "H100" enable_eval: bool = True extra_args: str = "" data_dir: str = "/root/datasets" @@ -31,7 +32,10 @@ class ScriptArgs(U.ExecuteTrainConfig): tis_use_rs: bool = True def __post_init__(self): - self.num_gpus_per_node = self.num_gpus_per_node or U.NUM_GPUS_OF_HARDWARE[self.hardware] + if self.hardware in ("MI300X", "MI350X"): + self.num_gpus_per_node = self.num_gpus_per_node or 8 + else: + self.num_gpus_per_node = self.num_gpus_per_node or U.NUM_GPUS_OF_HARDWARE[self.hardware] if self.rollout_int4: assert not self.rollout_fp8, "rollout_int4 and rollout_fp8 cannot be enabled at the same time" assert not self.rollout_mxfp8, "rollout_int4 and rollout_mxfp8 cannot be enabled at the same time" @@ -173,6 +177,9 @@ def execute(args: ScriptArgs): "--use-fault-tolerance " f"--dump-details {args.output_dir}/{args.run_id}/dump_details " ) + if args.hardware in ("MI300X", "MI350X"): + os.environ.setdefault("RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES", "1") + os.environ.setdefault("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", "1") misc_env_vars = {} if args.rollout_int4: @@ -205,6 +212,24 @@ def execute(args: ScriptArgs): misc_env_vars |= { "NVTE_FP8_BLOCK_SCALING_FP32_SCALES": "1", } + case "MI300X" | "MI350X": + # ROCm gfx950 blockwise FP8 via ported Triton kernels. ROCm has no + # wgrad fusion yet, so disable gradient-accumulation-fusion (wgrad + # returns a plain grad; Megatron DDP accumulates into fp32 main_grad). + misc_args += ( + "--transformer-impl transformer_engine " + "--bf16 " + "--fp8-format e4m3 " + "--fp8-recipe blockwise " + "--no-gradient-accumulation-fusion " + ) + misc_env_vars |= { + "NVTE_FP8_BLOCK_SCALING_FP32_SCALES": "1", + "NVTE_ROCM_ENABLE_FP8_BLOCK_SCALING": "1", + # keep Ray from blanking HIP/CUDA visibility for the job entrypoint + "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES": "1", + "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1", + } if args.enable_megatron_bridge: misc_args += "--megatron-to-hf-mode bridge " @@ -268,6 +293,24 @@ def execute(args: ScriptArgs): ) else: sglang_args += "--rollout-num-gpus-per-engine 4 " "--sglang-cuda-graph-max-bs 512 " + case ("MI300X" | "MI350X", 1): + perf_args += ( + "--tensor-model-parallel-size 1 " + "--sequence-parallel " + "--pipeline-model-parallel-size 2 " + "--context-parallel-size 2 " + "--expert-model-parallel-size 4 " + "--expert-tensor-parallel-size 1 " + "--max-tokens-per-gpu 8192 " + ) + sglang_args = ( + "--rollout-num-gpus-per-engine 8 " + "--sglang-mem-fraction-static 0.7 " + "--sglang-max-running-requests 512 " + ) + optimizer_args += ( + "--optimizer-cpu-offload " "--overlap-cpu-optimizer-d2h-h2d " "--use-precision-aware-optimizer " + ) case _: raise NotImplementedError From 6716938f49737fed5bc0476580d32e06e799664c Mon Sep 17 00:00:00 2001 From: Zhiyao Jiang Date: Fri, 19 Jun 2026 20:42:21 +0000 Subject: [PATCH 4/5] Clean up AMD fp8 launcher: MI350X/MI355X (gfx950), drop redundant num_gpus/NOSET special-cases --- scripts/run_qwen3_30b_a3b.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/scripts/run_qwen3_30b_a3b.py b/scripts/run_qwen3_30b_a3b.py index 711530c0de..16e1d32994 100644 --- a/scripts/run_qwen3_30b_a3b.py +++ b/scripts/run_qwen3_30b_a3b.py @@ -1,4 +1,3 @@ -import os from dataclasses import dataclass from typing import Literal @@ -14,7 +13,7 @@ class ScriptArgs(U.ExecuteTrainConfig): model_name: str = "Qwen3-30B-A3B" megatron_model_type: str = "qwen3-30B-A3B" num_gpus_per_node: int | None = None - hardware: Literal["H100", "B200", "B300", "GB200", "GB300", "MI300X", "MI350X"] = "H100" + hardware: Literal["H100", "B200", "B300", "GB200", "GB300", "MI350X", "MI355X"] = "H100" enable_eval: bool = True extra_args: str = "" data_dir: str = "/root/datasets" @@ -32,10 +31,7 @@ class ScriptArgs(U.ExecuteTrainConfig): tis_use_rs: bool = True def __post_init__(self): - if self.hardware in ("MI300X", "MI350X"): - self.num_gpus_per_node = self.num_gpus_per_node or 8 - else: - self.num_gpus_per_node = self.num_gpus_per_node or U.NUM_GPUS_OF_HARDWARE[self.hardware] + self.num_gpus_per_node = self.num_gpus_per_node or U.NUM_GPUS_OF_HARDWARE[self.hardware] if self.rollout_int4: assert not self.rollout_fp8, "rollout_int4 and rollout_fp8 cannot be enabled at the same time" assert not self.rollout_mxfp8, "rollout_int4 and rollout_mxfp8 cannot be enabled at the same time" @@ -177,9 +173,6 @@ def execute(args: ScriptArgs): "--use-fault-tolerance " f"--dump-details {args.output_dir}/{args.run_id}/dump_details " ) - if args.hardware in ("MI300X", "MI350X"): - os.environ.setdefault("RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES", "1") - os.environ.setdefault("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", "1") misc_env_vars = {} if args.rollout_int4: @@ -212,10 +205,9 @@ def execute(args: ScriptArgs): misc_env_vars |= { "NVTE_FP8_BLOCK_SCALING_FP32_SCALES": "1", } - case "MI300X" | "MI350X": - # ROCm gfx950 blockwise FP8 via ported Triton kernels. ROCm has no - # wgrad fusion yet, so disable gradient-accumulation-fusion (wgrad - # returns a plain grad; Megatron DDP accumulates into fp32 main_grad). + case "MI350X" | "MI355X": + # ROCm gfx950: blockwise FP8 via ported Triton kernels. + # ROCm has no wgrad fusion yet, so turn off gradient-accumulation-fusion. misc_args += ( "--transformer-impl transformer_engine " "--bf16 " @@ -293,7 +285,7 @@ def execute(args: ScriptArgs): ) else: sglang_args += "--rollout-num-gpus-per-engine 4 " "--sglang-cuda-graph-max-bs 512 " - case ("MI300X" | "MI350X", 1): + case ("MI350X" | "MI355X", 1): perf_args += ( "--tensor-model-parallel-size 1 " "--sequence-parallel " From b60ade26bebe6d472f221aab22061db9e5092754 Mon Sep 17 00:00:00 2001 From: Zhiyao Jiang Date: Sat, 20 Jun 2026 21:41:15 +0000 Subject: [PATCH 5/5] AMD MI350X fp8: max-tokens-per-gpu 16384 (+11%) and rollout-num-gpus-per-engine 2 (eval fix) Co-authored-by: Xinyu Jiang --- scripts/run_qwen3_30b_a3b.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/run_qwen3_30b_a3b.py b/scripts/run_qwen3_30b_a3b.py index 16e1d32994..110ab4ee86 100644 --- a/scripts/run_qwen3_30b_a3b.py +++ b/scripts/run_qwen3_30b_a3b.py @@ -293,10 +293,10 @@ def execute(args: ScriptArgs): "--context-parallel-size 2 " "--expert-model-parallel-size 4 " "--expert-tensor-parallel-size 1 " - "--max-tokens-per-gpu 8192 " + "--max-tokens-per-gpu 16384 " ) sglang_args = ( - "--rollout-num-gpus-per-engine 8 " + "--rollout-num-gpus-per-engine 2 " "--sglang-mem-fraction-static 0.7 " "--sglang-max-running-requests 512 " )