Skip to content

Commit 006d645

Browse files
zihugithubzihugithub
andauthored
[CICD] Add Ascend NPU and MetaX multi-chip test support (flagos-ai#1160)
### PR Category
CICD

### PR Types
Test Case

### PR Description
- Add hardware config files for Ascend (.github/configs/ascend.yml) and MetaX (.github/configs/metax.yml) platforms
- Add dedicated CI workflows: all_tests_ascend.yml, all_tests_metax.yml
- Add platform-specific test configs under tests/test_utils/config/platforms/
- Add Qwen3 test cases for Ascend (inference/serve 4b_tp1) and MetaX (train 0_6b)
- Support env-aware CLI setup in functional test workflows
- Fix NPU-specific issues in inference, serve, and train code paths
- Update check_results.py and parse_config.py to handle multi-chip configs

---------

Co-authored-by: zihugithub <fbye@baai.ac.cn>
1 parent a3d880f commit 006d645

25 files changed

Lines changed: 729 additions & 22 deletions

.github/configs/ascend.yml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
---
# Hardware profile: Huawei Ascend NPU
# Holds the CI/CD settings used for Ascend NPU-based testing.
# Which tests run is configured separately in
# tests/test_utils/config/platforms/ascend.yaml

hardware_name: ascend
display_name: 'Ascend NPU Tests'

# Container images (default, training, inference) -- all three point at the
# same Ascend CI image.
ci_image: 'harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.1.0-ascend-ci'
ci_train_image: 'harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.1.0-ascend-ci'
ci_inference_image: 'harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.1.0-ascend-ci'

# Labels a runner must carry to pick up jobs for this hardware
runner_labels:
  - self-hosted
  - Linux
  - ARM64
  - ascend-72
  - npus-8

# Host-to-container bind mounts (paths are specific to the Ascend hosts).
# The Ascend driver, add-ons and npu-smi are mounted read-only.
container_volumes:
  - '/home/flagscale_cicd/flask/static:/workspace/report'
  - '/home/flagscale_cicd/flask/config:/workspace/config'
  - '/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data'
  - '/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers'
  - '/usr/local/Ascend/driver:/usr/local/Ascend/driver:ro'
  - '/usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons:ro'
  - '/usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi:ro'

# Container options for Ascend NPU: there is no --gpus flag, so each NPU card
# and the Ascend management devices are passed with --device.
# Folded scalar: the lines below are joined with single spaces into one string.
container_options: >-
  --device /dev/davinci0 --device /dev/davinci1
  --device /dev/davinci2 --device /dev/davinci3
  --device /dev/davinci4 --device /dev/davinci5
  --device /dev/davinci6 --device /dev/davinci7
  --device /dev/davinci8 --device /dev/davinci9
  --device /dev/davinci10 --device /dev/davinci11
  --device /dev/davinci12 --device /dev/davinci13
  --device /dev/davinci14 --device /dev/davinci15
  --device /dev/davinci_manager
  --device /dev/devmm_svm
  --device /dev/hisi_hdc
  --privileged
  --ipc=host
  --shm-size=500g
  --hostname flagscale_cicd
  --user root
  --ulimit nofile=65535:65535

# =============================================================================
# Package Manager Configuration
# =============================================================================
# The Ascend CI image ships with all dependencies pre-installed, so plain pip
# is used.
pkg_mgr: 'pip'

# Environment path -- unused with pip, present for compatibility
env_path: ''

# Conda environment name per task -- unused with pip, so every entry is empty
env_names:
  train: ''
  hetero_train: ''
  inference: ''
  serve: ''
  rl: ''

.github/configs/metax.yml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
---
# MetaX C500 Hardware Configuration
# This file defines CI/CD settings for MetaX C500 GPU (MACA) testing.
# Test configurations are defined in tests/test_utils/config/platforms/metax.yaml

hardware_name: metax
display_name: "MetaX C500 Tests"

# Docker images for MetaX C500 (MACA-based).
# All three entries reference the same image in a localhost:5000 registry.
ci_image: "localhost:5000/megatron-lm-with-te:202603231839"
ci_train_image: "localhost:5000/megatron-lm-with-te:202603231839"
ci_inference_image: "localhost:5000/megatron-lm-with-te:202603231839"

# Runner labels for MetaX C500 hardware
runner_labels:
  - self-hosted
  - Linux
  - X64
  - metax-c500
  - gpus-2

# Container volumes (hardware-specific host paths)
container_volumes:
  - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data
  - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers

# Container options for MetaX C500 (MACA runtime).
# Folded scalar: the lines below are joined with single spaces, producing one
# option string with the flags in the order listed.
container_options: >-
  --hostname=flagscale-ci
  --ipc=host
  --privileged
  --group-add video
  --shm-size=100g
  --ulimit memlock=-1
  --security-opt seccomp=unconfined
  --security-opt apparmor=unconfined
  --device=/dev/dri
  --device=/dev/mxcd
  --device=/dev/infiniband
  --user root
  --ulimit nofile=65535:65535

# =============================================================================
# Package Manager Configuration
# =============================================================================
pkg_mgr: "conda"

# Environment path (conda installation path)
env_path: "/opt/conda"

# Conda environment names, one entry per task type.
# An empty name leaves that task without a configured conda environment.
# NOTE(review): presumably consumers skip activation for empty names, as the
# CLI workflow does -- confirm against parse_config.py.
env_names:
  train: "base"
  hetero_train: ""
  inference: ""
  serve: ""
  rl: ""
.github/workflows/all_tests_ascend.yml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
---
# Entry-point workflow for the Ascend platform: delegates the actual testing to
# the shared reusable workflow and then reports one aggregate status.
name: ascend_tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# One active run per workflow + PR number (or ref when there is no PR) + actor;
# a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true

jobs:
  # Run the common test workflow with the Ascend platform selected.
  run_tests:
    uses: ./.github/workflows/all_tests_common.yml
    with:
      platform: ascend

  # Aggregate gate: `always()` makes this job run even when run_tests did not
  # succeed, so the step below can turn any non-"success" result into a failure.
  all_tests:
    needs: run_tests
    runs-on: ubuntu-latest
    if: always()
    steps:
      - name: Verify workflow status
        run: |
          tests_result="${{ needs.run_tests.result }}"
          if [ "$tests_result" = "success" ]; then
            echo "✅ All tests passed!"
          else
            echo "❌ Tests workflow failed"
            exit 1
          fi

.github/workflows/all_tests_common.yml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,11 +111,15 @@ jobs:
111111
container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
112112
container_options: ${{ needs.checkout_and_config.outputs.container_options }}
113113
source_artifact: flagscale-source-${{ github.sha }}
114+
pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
115+
env_name: ${{ needs.checkout_and_config.outputs.env_name_train }}
116+
env_path: ${{ needs.checkout_and_config.outputs.env_path }}
114117

115118
unit_tests:
116119
needs:
117120
- checkout_and_config
118121
- cli_validation
122+
if: fromJson(needs.checkout_and_config.outputs.device_types)[0] != null
119123
strategy:
120124
fail-fast: false
121125
matrix:
@@ -266,12 +270,14 @@ jobs:
266270
# Check all test jobs (skip if not run)
267271
failed=false
268272
269-
if [ "${{ needs.unit_tests.result }}" != "success" ]; then
273+
if [ "${{ needs.unit_tests.result }}" != "success" ] && \
274+
[ "${{ needs.unit_tests.result }}" != "skipped" ]; then
270275
echo "❌ Unit tests failed"
271276
failed=true
272277
fi
273278
274-
if [ "${{ needs.cli_validation.result }}" != "success" ]; then
279+
if [ "${{ needs.cli_validation.result }}" != "success" ] && \
280+
[ "${{ needs.cli_validation.result }}" != "skipped" ]; then
275281
echo "❌ CLI validation failed"
276282
failed=true
277283
fi
.github/workflows/all_tests_metax.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
---
# Entry-point workflow for the MetaX C500 platform: delegates the actual testing
# to the shared reusable workflow and then reports one aggregate status.
name: metax_c500_tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

# One active run per workflow + PR number (or ref when there is no PR) + actor;
# a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true

jobs:
  # Hardware and environment settings come from .github/configs/metax.yml;
  # the set of tests to run comes from
  # tests/test_utils/config/platforms/metax.yaml.
  run_tests:
    uses: ./.github/workflows/all_tests_common.yml
    with:
      platform: metax

  # Aggregate gate: `always()` makes this job run even when run_tests did not
  # succeed, so the step below can turn any non-"success" result into a failure.
  all_tests:
    needs: run_tests
    runs-on: ubuntu-latest
    if: always()
    steps:
      - name: Verify workflow status
        run: |
          tests_result="${{ needs.run_tests.result }}"
          if [ "$tests_result" = "success" ]; then
            echo "✅ All MetaX C500 tests passed!"
          else
            echo "❌ Tests workflow failed"
            exit 1
          fi

.github/workflows/functional_tests_cli.yml

Lines changed: 72 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,21 @@ on:
2323
required: true
2424
type: string
2525
description: Name of the artifact containing source code
26+
pkg_mgr:
27+
required: false
28+
type: string
29+
description: Package manager (pip, uv, conda). Default uv.
30+
default: "uv"
31+
env_name:
32+
required: false
33+
type: string
34+
description: Conda environment name (for conda only)
35+
default: ""
36+
env_path:
37+
required: false
38+
type: string
39+
description: Environment path (venv path for uv, conda installation path for conda)
40+
default: "/opt/venv"
2641

2742
jobs:
2843
functional_test_cli:
@@ -93,11 +108,40 @@ jobs:
93108
set -euo pipefail
94109
cd $PROJECT_ROOT
95110
96-
echo "Installing FlagScale CLI (outside virtual environment)"
111+
PKG_MGR='${{ inputs.pkg_mgr }}'
112+
ENV_NAME='${{ inputs.env_name }}'
113+
ENV_PATH='${{ inputs.env_path }}'
114+
115+
echo "Installing FlagScale CLI"
116+
echo "Installing dependencies for training"
117+
echo "Package Manager: $PKG_MGR"
118+
echo "Environment Name: $ENV_NAME"
119+
echo "Environment Path: $ENV_PATH"
120+
121+
# Source environment utilities
122+
source ./tools/install/utils/pyenv_utils.sh
123+
124+
# Activate environment based on package manager
125+
case "$PKG_MGR" in
126+
conda)
127+
if [ -n "$ENV_NAME" ] && [ -n "$ENV_PATH" ]; then
128+
activate_conda "$ENV_NAME" "$ENV_PATH" || { echo "❌ Conda activation failed"; exit 1; }
129+
fi
130+
;;
131+
uv)
132+
if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then
133+
activate_uv_env "$ENV_PATH" || { echo "❌ UV activation failed"; exit 1; }
134+
fi
135+
;;
136+
pip)
137+
echo "Using system Python with pip"
138+
;;
139+
esac
140+
97141
echo "Python location: $(which python)"
98142
echo "Python version: $(python --version)"
99143
100-
# Install FlagScale CLI using system pip
144+
# Install FlagScale CLI
101145
pip install . --no-build-isolation --root-user-action=ignore || { echo "❌ FlagScale CLI install failed"; exit 1; }
102146
103147
# Verify installation
@@ -111,7 +155,32 @@ jobs:
111155
set -euo pipefail
112156
cd $PROJECT_ROOT
113157
114-
echo "Validating CLI (outside virtual environment)"
158+
PKG_MGR='${{ inputs.pkg_mgr }}'
159+
ENV_NAME='${{ inputs.env_name }}'
160+
ENV_PATH='${{ inputs.env_path }}'
161+
162+
# Source environment utilities
163+
source ./tools/install/utils/pyenv_utils.sh
164+
165+
# Activate environment based on package manager
166+
# (conda: needs both a name and a path; uv: needs an existing directory; pip: system Python, nothing to activate)
167+
case "$PKG_MGR" in
168+
conda)
169+
if [ -n "$ENV_NAME" ] && [ -n "$ENV_PATH" ]; then
170+
activate_conda "$ENV_NAME" "$ENV_PATH" || { echo "❌ Conda activation failed"; exit 1; }
171+
fi
172+
;;
173+
uv)
174+
if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then
175+
activate_uv_env "$ENV_PATH" || { echo "❌ UV activation failed"; exit 1; }
176+
fi
177+
;;
178+
pip)
179+
echo "Using system Python with pip"
180+
;;
181+
esac
182+
183+
echo "Validating CLI"
115184
echo "Python location: $(which python)"
116185
echo "Python version: $(python --version)"
117186

.github/workflows/functional_tests_inference.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ jobs:
115115
git config --global --add safe.directory $PROJECT_ROOT
116116
117117
- name: Setup environment for inference
118+
if: inputs.platform == 'cuda'
118119
run: |
119120
set -euo pipefail
120121
cd $PROJECT_ROOT
@@ -164,6 +165,27 @@ jobs:
164165
165166
timeout-minutes: 5
166167

168+
- name: Install dependencies for inference ascend
169+
if: inputs.platform == 'ascend'
170+
run: |
171+
set -euo pipefail
172+
173+
# Install vllm-plugin-FL
174+
pip install vllm-plugin-fl==0.1.0+vllm0.13.0 \
175+
--extra-index-url https://resource.flagos.net/repository/flagos-pypi-hosted/simple \
176+
|| { echo "❌ vllm-plugin-FL install failed"; exit 1; }
177+
echo "✅ vllm-plugin-FL installed successfully"
178+
179+
# Install FlagScale and dependencies
180+
cd $PROJECT_ROOT
181+
pip install . --no-build-isolation --root-user-action=ignore || { echo "❌ FlagScale CLI install failed"; exit 1; }
182+
183+
# Verify installation
184+
command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; }
185+
echo "✅ FlagScale CLI installed successfully: $(flagscale --version 2>/dev/null || echo 'version unknown')"
186+
echo "✅ Environment ready for inference tests"
187+
timeout-minutes: 15
188+
167189
- name: Run functional tests
168190
id: functional_test
169191
run: |

.github/workflows/functional_tests_serve.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ jobs:
115115
git config --global --add safe.directory $PROJECT_ROOT
116116
117117
- name: Install dependencies for serve
118+
if: inputs.platform == 'cuda'
118119
run: |
119120
set -euo pipefail
120121
cd $PROJECT_ROOT
@@ -163,6 +164,27 @@ jobs:
163164
echo "Environment ready for serve tests"
164165
timeout-minutes: 5
165166

167+
- name: Install dependencies for serve ascend
168+
if: inputs.platform == 'ascend'
169+
run: |
170+
set -euo pipefail
171+
172+
# Install vllm-plugin-FL
173+
pip install vllm-plugin-fl==0.1.0+vllm0.13.0 \
174+
--extra-index-url https://resource.flagos.net/repository/flagos-pypi-hosted/simple \
175+
|| { echo "❌ vllm-plugin-FL install failed"; exit 1; }
176+
echo "✅ vllm-plugin-FL installed successfully"
177+
178+
# Install FlagScale
179+
cd $PROJECT_ROOT
180+
pip install . --no-build-isolation --root-user-action=ignore || { echo "❌ FlagScale CLI install failed"; exit 1; }
181+
182+
# Verify installation
183+
command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; }
184+
echo "✅ FlagScale CLI installed successfully: $(flagscale --version 2>/dev/null || echo 'version unknown')"
185+
echo "✅ Environment ready for serve tests"
186+
timeout-minutes: 15
187+
166188
- name: Run functional tests
167189
id: functional_test
168190
env:

0 commit comments

Comments
 (0)