From 202af786d2fe7e6b53e83eccc58b64485d9e1000 Mon Sep 17 00:00:00 2001
From: JessicaJiang-123 <jessicajiang324@gmail.com>
Date: Sun, 14 Jun 2026 04:08:45 +0000
Subject: [PATCH 1/5] Disable gradient-accumulation-fusion for ROCm blockwise
 FP8 Qwen3-30B-A3B (first-cut)

---
 examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh b/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh
index e761a3acdf..87c872de35 100644
--- a/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh
+++ b/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh
@@ -88,6 +88,10 @@ PERF_ARGS=(
    --bf16
    --fp8-format e4m3
    --fp8-recipe blockwise
+   # ROCm/gfx950 blockwise grouped FP8 first-cut does not yet implement wgrad fusion.
+   # Disable it so the MoE GroupedLinear returns a plain wgrad and Megatron's DDP
+   # post-hook accumulates it into the fp32 main_grad (numerically equivalent for bring-up).
+   --no-gradient-accumulation-fusion
    # --fp8-param-gather
 )
 

From 8bb4d135d341ccbbb5009c5b0f75af325c906ce3 Mon Sep 17 00:00:00 2001
From: Zhiyao Jiang <jessicajiang324@gmail.com>
Date: Tue, 16 Jun 2026 05:13:21 +0000
Subject: [PATCH 2/5] Enable restore_weights_before_loading for fp8 quant in RL
 weight update

---
 .../megatron_utils/update_weight/update_weight_from_tensor.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/miles/backends/megatron_utils/update_weight/update_weight_from_tensor.py b/miles/backends/megatron_utils/update_weight/update_weight_from_tensor.py
index b017b796cb..ef30e09c7b 100644
--- a/miles/backends/megatron_utils/update_weight/update_weight_from_tensor.py
+++ b/miles/backends/megatron_utils/update_weight/update_weight_from_tensor.py
@@ -195,7 +195,7 @@ def update_weights(self) -> None:
             if (
                 not skip_base_sync
                 and self.quantization_config
-                and self.quantization_config["quant_method"] in ["compressed-tensors"]
+                and self.quantization_config["quant_method"] in ["compressed-tensors", "fp8"]
             ):
                 post_process_weights(
                     rollout_engines=self.rollout_engines,

From bbedc7f755b5440ab6970d72a2df5ab965a1b055 Mon Sep 17 00:00:00 2001
From: Zhiyao Jiang <jessicajiang324@gmail.com>
Date: Fri, 19 Jun 2026 20:17:19 +0000
Subject: [PATCH 3/5] Add AMD MI350X (gfx950) fp8 blockwise path to
 run_qwen3_30b_a3b.py; revert example .sh ROCm edit

---
 .../run-qwen3-30b-a3b-fp8-two-nodes.sh        |  4 --
 scripts/run_qwen3_30b_a3b.py                  | 47 ++++++++++++++++++-
 2 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh b/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh
index 87c872de35..e761a3acdf 100644
--- a/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh
+++ b/examples/low_precision/run-qwen3-30b-a3b-fp8-two-nodes.sh
@@ -88,10 +88,6 @@ PERF_ARGS=(
    --bf16
    --fp8-format e4m3
    --fp8-recipe blockwise
-   # ROCm/gfx950 blockwise grouped FP8 first-cut does not yet implement wgrad fusion.
-   # Disable it so the MoE GroupedLinear returns a plain wgrad and Megatron's DDP
-   # post-hook accumulates it into the fp32 main_grad (numerically equivalent for bring-up).
-   --no-gradient-accumulation-fusion
    # --fp8-param-gather
 )
 
diff --git a/scripts/run_qwen3_30b_a3b.py b/scripts/run_qwen3_30b_a3b.py
index cb9e225f7c..711530c0de 100644
--- a/scripts/run_qwen3_30b_a3b.py
+++ b/scripts/run_qwen3_30b_a3b.py
@@ -1,3 +1,4 @@
+import os
 from dataclasses import dataclass
 from typing import Literal
 
@@ -13,7 +14,7 @@ class ScriptArgs(U.ExecuteTrainConfig):
     model_name: str = "Qwen3-30B-A3B"
     megatron_model_type: str = "qwen3-30B-A3B"
     num_gpus_per_node: int | None = None
-    hardware: Literal["H100", "B200", "B300", "GB200", "GB300"] = "H100"
+    hardware: Literal["H100", "B200", "B300", "GB200", "GB300", "MI300X", "MI350X"] = "H100"
     enable_eval: bool = True
     extra_args: str = ""
     data_dir: str = "/root/datasets"
@@ -31,7 +32,10 @@ class ScriptArgs(U.ExecuteTrainConfig):
     tis_use_rs: bool = True
 
     def __post_init__(self):
-        self.num_gpus_per_node = self.num_gpus_per_node or U.NUM_GPUS_OF_HARDWARE[self.hardware]
+        if self.hardware in ("MI300X", "MI350X"):
+            self.num_gpus_per_node = self.num_gpus_per_node or 8
+        else:
+            self.num_gpus_per_node = self.num_gpus_per_node or U.NUM_GPUS_OF_HARDWARE[self.hardware]
         if self.rollout_int4:
             assert not self.rollout_fp8, "rollout_int4 and rollout_fp8 cannot be enabled at the same time"
             assert not self.rollout_mxfp8, "rollout_int4 and rollout_mxfp8 cannot be enabled at the same time"
@@ -173,6 +177,9 @@ def execute(args: ScriptArgs):
         "--use-fault-tolerance "
         f"--dump-details {args.output_dir}/{args.run_id}/dump_details "
     )
+    if args.hardware in ("MI300X", "MI350X"):
+        os.environ.setdefault("RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES", "1")
+        os.environ.setdefault("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", "1")
     misc_env_vars = {}
 
     if args.rollout_int4:
@@ -205,6 +212,24 @@ def execute(args: ScriptArgs):
                 misc_env_vars |= {
                     "NVTE_FP8_BLOCK_SCALING_FP32_SCALES": "1",
                 }
+            case "MI300X" | "MI350X":
+                # ROCm gfx950 blockwise FP8 via ported Triton kernels. ROCm has no
+                # wgrad fusion yet, so disable gradient-accumulation-fusion (wgrad
+                # returns a plain grad; Megatron DDP accumulates into fp32 main_grad).
+                misc_args += (
+                    "--transformer-impl transformer_engine "
+                    "--bf16 "
+                    "--fp8-format e4m3 "
+                    "--fp8-recipe blockwise "
+                    "--no-gradient-accumulation-fusion "
+                )
+                misc_env_vars |= {
+                    "NVTE_FP8_BLOCK_SCALING_FP32_SCALES": "1",
+                    "NVTE_ROCM_ENABLE_FP8_BLOCK_SCALING": "1",
+                    # keep Ray from blanking HIP/CUDA visibility for the job entrypoint
+                    "RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES": "1",
+                    "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1",
+                }
 
     if args.enable_megatron_bridge:
         misc_args += "--megatron-to-hf-mode bridge "
@@ -268,6 +293,24 @@ def execute(args: ScriptArgs):
                 )
             else:
                 sglang_args += "--rollout-num-gpus-per-engine 4 " "--sglang-cuda-graph-max-bs 512 "
+        case ("MI300X" | "MI350X", 1):
+            perf_args += (
+                "--tensor-model-parallel-size 1 "
+                "--sequence-parallel "
+                "--pipeline-model-parallel-size 2 "
+                "--context-parallel-size 2 "
+                "--expert-model-parallel-size 4 "
+                "--expert-tensor-parallel-size 1 "
+                "--max-tokens-per-gpu 8192 "
+            )
+            sglang_args = (
+                "--rollout-num-gpus-per-engine 8 "
+                "--sglang-mem-fraction-static 0.7 "
+                "--sglang-max-running-requests 512 "
+            )
+            optimizer_args += (
+                "--optimizer-cpu-offload " "--overlap-cpu-optimizer-d2h-h2d " "--use-precision-aware-optimizer "
+            )
         case _:
             raise NotImplementedError
 

From 6716938f49737fed5bc0476580d32e06e799664c Mon Sep 17 00:00:00 2001
From: Zhiyao Jiang <jessicajiang324@gmail.com>
Date: Fri, 19 Jun 2026 20:42:21 +0000
Subject: [PATCH 4/5] Clean up AMD fp8 launcher: MI350X/MI355X (gfx950), drop
 redundant num_gpus/NOSET special-cases

---
 scripts/run_qwen3_30b_a3b.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/scripts/run_qwen3_30b_a3b.py b/scripts/run_qwen3_30b_a3b.py
index 711530c0de..16e1d32994 100644
--- a/scripts/run_qwen3_30b_a3b.py
+++ b/scripts/run_qwen3_30b_a3b.py
@@ -1,4 +1,3 @@
-import os
 from dataclasses import dataclass
 from typing import Literal
 
@@ -14,7 +13,7 @@ class ScriptArgs(U.ExecuteTrainConfig):
     model_name: str = "Qwen3-30B-A3B"
     megatron_model_type: str = "qwen3-30B-A3B"
     num_gpus_per_node: int | None = None
-    hardware: Literal["H100", "B200", "B300", "GB200", "GB300", "MI300X", "MI350X"] = "H100"
+    hardware: Literal["H100", "B200", "B300", "GB200", "GB300", "MI350X", "MI355X"] = "H100"
     enable_eval: bool = True
     extra_args: str = ""
     data_dir: str = "/root/datasets"
@@ -32,10 +31,7 @@ class ScriptArgs(U.ExecuteTrainConfig):
     tis_use_rs: bool = True
 
     def __post_init__(self):
-        if self.hardware in ("MI300X", "MI350X"):
-            self.num_gpus_per_node = self.num_gpus_per_node or 8
-        else:
-            self.num_gpus_per_node = self.num_gpus_per_node or U.NUM_GPUS_OF_HARDWARE[self.hardware]
+        self.num_gpus_per_node = self.num_gpus_per_node or U.NUM_GPUS_OF_HARDWARE[self.hardware]
         if self.rollout_int4:
             assert not self.rollout_fp8, "rollout_int4 and rollout_fp8 cannot be enabled at the same time"
             assert not self.rollout_mxfp8, "rollout_int4 and rollout_mxfp8 cannot be enabled at the same time"
@@ -177,9 +173,6 @@ def execute(args: ScriptArgs):
         "--use-fault-tolerance "
         f"--dump-details {args.output_dir}/{args.run_id}/dump_details "
     )
-    if args.hardware in ("MI300X", "MI350X"):
-        os.environ.setdefault("RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES", "1")
-        os.environ.setdefault("RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES", "1")
     misc_env_vars = {}
 
     if args.rollout_int4:
@@ -212,10 +205,9 @@ def execute(args: ScriptArgs):
                 misc_env_vars |= {
                     "NVTE_FP8_BLOCK_SCALING_FP32_SCALES": "1",
                 }
-            case "MI300X" | "MI350X":
-                # ROCm gfx950 blockwise FP8 via ported Triton kernels. ROCm has no
-                # wgrad fusion yet, so disable gradient-accumulation-fusion (wgrad
-                # returns a plain grad; Megatron DDP accumulates into fp32 main_grad).
+            case "MI350X" | "MI355X":
+                # ROCm gfx950: blockwise FP8 via ported Triton kernels.
+                # ROCm has no wgrad fusion yet, so turn off gradient-accumulation-fusion.
                 misc_args += (
                     "--transformer-impl transformer_engine "
                     "--bf16 "
@@ -293,7 +285,7 @@ def execute(args: ScriptArgs):
                 )
             else:
                 sglang_args += "--rollout-num-gpus-per-engine 4 " "--sglang-cuda-graph-max-bs 512 "
-        case ("MI300X" | "MI350X", 1):
+        case ("MI350X" | "MI355X", 1):
             perf_args += (
                 "--tensor-model-parallel-size 1 "
                 "--sequence-parallel "

From b60ade26bebe6d472f221aab22061db9e5092754 Mon Sep 17 00:00:00 2001
From: Zhiyao Jiang <jessicajiang324@gmail.com>
Date: Sat, 20 Jun 2026 21:41:15 +0000
Subject: [PATCH 5/5] AMD MI350X fp8: max-tokens-per-gpu 16384 (+11%) and
 rollout-num-gpus-per-engine 2 (eval fix)

Co-authored-by: Xinyu Jiang <xinyuj2@andrew.cmu.edu>
---
 scripts/run_qwen3_30b_a3b.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/run_qwen3_30b_a3b.py b/scripts/run_qwen3_30b_a3b.py
index 16e1d32994..110ab4ee86 100644
--- a/scripts/run_qwen3_30b_a3b.py
+++ b/scripts/run_qwen3_30b_a3b.py
@@ -293,10 +293,10 @@ def execute(args: ScriptArgs):
                 "--context-parallel-size 2 "
                 "--expert-model-parallel-size 4 "
                 "--expert-tensor-parallel-size 1 "
-                "--max-tokens-per-gpu 8192 "
+                "--max-tokens-per-gpu 16384 "
             )
             sglang_args = (
-                "--rollout-num-gpus-per-engine 8 "
+                "--rollout-num-gpus-per-engine 2 "
                 "--sglang-mem-fraction-static 0.7 "
                 "--sglang-max-running-requests 512 "
             )