Skip to content

Commit d2bcde2

Browse files
authored
[train][inference] qwen_gr00t supports ascend and musa platform (flagos-ai#1178)
### PR Category <!-- One of [ Train | Inference | Compress | Serve | RL | Core | Hardware | CICD | Tools | Others ] --> Train | Inference ### PR Types <!-- One of [ User Experience | New Features | Bug Fixes | Improvements | Performance | Breaking Change| Deprecations | Test Case | Docs | Others ] --> Others ### PR Description <!-- Describe what you’ve done --> 1. Adapt qwen_gr00t training/inference to the Huawei Ascend and Moore Threads MUSA platforms
1 parent 006d645 commit d2bcde2

File tree

5 files changed

+19
-13
lines changed

5 files changed

+19
-13
lines changed

examples/qwen_gr00t/README.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ Install FlagScale and training dependencies:
2424

2525
```sh
2626
cd FlagScale/
27+
# "[cuda-train]" is for NVIDIA GPUs; replace with "[ascend-train]" on Huawei Ascend, or "[musa-train]" on Moore Threads MUSA
2728
pip install ".[cuda-train]" --verbose
2829
```
2930

@@ -116,10 +117,12 @@ vim examples/qwen_gr00t/conf/train.yaml
116117

117118
Configure the following fields:
118119

119-
- `experiment.envs.CUDA_VISIBLE_DEVICES` - GPU devices to use (default: `"0,1,2,3,4,5,6,7"` for 8 GPUs)
120-
- `experiment.envs.CUDA_DEVICE_MAX_CONNECTIONS` - Connection limit (typically `1`)
120+
- `experiment.envs.CUDA_VISIBLE_DEVICES` - GPU devices to use (e.g., `"0,1,2,3"` for 4 GPUs). Use `ASCEND_RT_VISIBLE_DEVICES` for Huawei Ascend, `MUSA_VISIBLE_DEVICES` for Moore Threads MUSA
121+
- `experiment.envs.CUDA_DEVICE_MAX_CONNECTIONS` - Connection limit (typically `1`). Use `MUSA_DEVICE_MAX_CONNECTIONS` for Moore Threads MUSA
122+
- `experiment.envs.MUSA_LAUNCH_BLOCKING` - Set to `"1"` on Moore Threads MUSA to enable synchronous kernel execution, useful for debugging
121123
- `experiment.exp_name` - Experiment name
122124
- `experiment.exp_dir` - Output directory for checkpoints and logs
125+
- `experiment.runner.nproc_per_node` - Number of processes per node for multi-GPU training (required for Huawei Ascend)
123126

124127
#### Task-Level Config
125128

@@ -199,7 +202,7 @@ model:
199202
- `data.vla_data.obs` - Observation image keys (default: `["image_0"]`)
200203
- `data.observation_delta_indices` - Observation delta indices (default: `[0]`)
201204
- `data.action_delta_indices` - Action delta indices (default: `[0,1,2,3,4,5,6,7]`)
202-
- `data.preprocessor` - Preprocessor pipeline configuration
205+
- `data.preprocessor` - Preprocessor pipeline configuration. For Moore Threads MUSA, set `device_processor.config.device` to `"musa"`; for Huawei Ascend, set it to `"npu"`.
203206
- `data.postprocessor` - Postprocessor pipeline configuration
204207

205208
### Start Training
@@ -250,7 +253,7 @@ Configure the following fields:
250253
**Engine settings:**
251254
- `engine.model_variant` - Model variant (default: `"QwenGr00t"`)
252255
- `engine.model` - Path to trained checkpoint (e.g., `/workspace/outputs/qwen_gr00t_train/checkpoints/last`)
253-
- `engine.device` - Device to use (e.g., `"cuda"`)
256+
- `engine.device` - Device to use (e.g., `"cuda"`, `"musa"`, or `"npu"`)
254257

255258
**Generate settings:**
256259
- `generate.images` - Dictionary mapping image keys to file paths:

flagscale/inference/inference_qwen_gr00t.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from flagscale.logger import logger
99
from flagscale.models.utils.constants import OBS_STATE
1010
from flagscale.models.vla import TrainablePolicy
11+
from flagscale.platforms import get_platform # noqa: F401 must be before model imports
1112
from flagscale.train.processor import PolicyProcessorPipeline
1213

1314

flagscale/models/vla/base_policy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ def from_pretrained(cls, pretrained_path, device="cpu", *, config=None):
157157
missing, unexpected = load_model(
158158
model,
159159
str(weights_path),
160-
device=device,
160+
device="cpu" if str(device) == "musa" else device,
161161
strict=False,
162162
)
163163
if missing:

flagscale/models/vla/qwen_gr00t/modeling_qwen_gr00t.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from flagscale.models.vla.base_policy import TrainablePolicy
3232
from flagscale.models.vla.registry import build_action_model, build_vlm
3333
from flagscale.models.vla.utils import get_vlm_config
34+
from flagscale.platforms.platform_manager import get_platform
3435

3536

3637
class QwenGr00t(TrainablePolicy):
@@ -91,7 +92,7 @@ def forward(
9192
qwen_inputs = self.vlm.build_qwenvl_inputs(images, instructions)
9293

9394
# TODO: (yupu) Hard-coded autocast and dtype, matches starVLA
94-
with torch.autocast("cuda", dtype=torch.bfloat16):
95+
with torch.autocast(get_platform().amp_device_type(), dtype=torch.bfloat16):
9596
vlm_output = self.vlm.forward(qwen_inputs, output_attentions=False)
9697
# last_hidden_state: [B, seq_len, H]
9798
last_hidden = vlm_output["hidden_states"][-1] # [B, L, H]
@@ -122,7 +123,7 @@ def forward(
122123
padded_actions.append(final_a)
123124
action_masks.append(mask)
124125

125-
with torch.autocast("cuda", dtype=torch.float32):
126+
with torch.autocast(get_platform().amp_device_type(), dtype=torch.float32):
126127
# TODO: (yupu) Is this a bug or a feature? The action dtype would stay as bf16 under this autocast.
127128
actions = torch.stack(padded_actions).to(
128129
device=last_hidden.device, dtype=last_hidden.dtype
@@ -156,7 +157,7 @@ def forward(
156157
result = {"loss": output["loss"]}
157158

158159
if vlm_batch is not None:
159-
with torch.autocast("cuda", dtype=torch.bfloat16):
160+
with torch.autocast(get_platform().amp_device_type(), dtype=torch.bfloat16):
160161
vlm_loss = self.vlm.model(**vlm_batch, return_dict=True).loss
161162
result["vlm_loss"] = vlm_loss
162163

@@ -194,7 +195,7 @@ def predict_action(self, batch: list[dict] | dict) -> dict:
194195

195196
qwen_inputs = self.vlm.build_qwenvl_inputs(images, instructions)
196197

197-
with torch.autocast("cuda", dtype=torch.bfloat16):
198+
with torch.autocast(get_platform().amp_device_type(), dtype=torch.bfloat16):
198199
vlm_output = self.vlm.forward(qwen_inputs, output_attentions=False)
199200
# last_hidden_state: [B, seq_len, H]
200201
last_hidden = vlm_output["hidden_states"][-1] # [B, L, H]
@@ -207,7 +208,7 @@ def predict_action(self, batch: list[dict] | dict) -> dict:
207208
state = state.to(device=last_hidden.device, dtype=last_hidden.dtype)
208209

209210
# Step 4: Action Expert Forward
210-
with torch.autocast("cuda", dtype=torch.float32):
211+
with torch.autocast(get_platform().amp_device_type(), dtype=torch.float32):
211212
vlm_output_for_action = {"hidden_states": last_hidden}
212213
action_input = {"state": state}
213214
output = self.action_model.predict_action(vlm_output_for_action, action_input)

flagscale/models/vla/vlm/qwenvl_backbone.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
from flagscale.logger import logger
2222
from flagscale.models.vla.registry import register_vlm
23+
from flagscale.platforms.platform_manager import get_platform
2324

2425

2526
@dataclass
@@ -142,7 +143,7 @@ def forward(self, batch: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.T
142143
f"[VLM.forward] input keys={list(batch.keys())} "
143144
+ " ".join(f"{k}={v.shape}" for k, v in batch.items() if isinstance(v, torch.Tensor))
144145
)
145-
with torch.autocast("cuda", dtype=torch.bfloat16):
146+
with torch.autocast(get_platform().amp_device_type(), dtype=torch.bfloat16):
146147
outputs = self.model(
147148
**batch,
148149
output_hidden_states=True,
@@ -205,7 +206,7 @@ def build_qwenvl_inputs(
205206

206207
# Use current CUDA device instead of self.model.device, which returns
207208
# a DTensor device under FSDP2 and causes mixed Tensor/DTensor errors.
208-
return batch_input.to(f"cuda:{torch.cuda.current_device()}")
209+
return batch_input.to(get_platform().device())
209210

210211

211212
@register_vlm("qwen3-vl")
@@ -253,4 +254,4 @@ def build_qwenvl_inputs(
253254

254255
# Use current CUDA device instead of self.model.device, which returns
255256
# a DTensor device under FSDP2 and causes mixed Tensor/DTensor errors.
256-
return batch_inputs.to(f"cuda:{torch.cuda.current_device()}")
257+
return batch_inputs.to(get_platform().device())

0 commit comments

Comments
 (0)