---
# .github/workflows/all-tests-nvidia.yml
# (Pasted from the GitHub web UI for PR "merge elastic test into nvidia #2";
#  page chrome removed and original workflow indentation restored.)
name: All Tests Nvidia

# Trigger on pushes/PRs to main, but skip runs that only touch non-NVIDIA
# hardware backends (each of those has its own dedicated workflow).
on:
  push:
    branches: ["main"]
    paths-ignore:
      - 'hardware/BI_V150/**'
      - 'hardware/Cambricon_MLU/**'
      - 'hardware/Huawei_Atlas800TA3/**'
      - 'hardware/Hygon_BW1000/**'
      - 'hardware/Kunlunxin_R310p/**'
      - 'hardware/MUSA_S5000/**'
      - 'hardware/Metax_C550/**'
      - 'hardware/Tsing_micro/**'
  pull_request:
    branches: ["main"]
    paths-ignore:
      - 'hardware/BI_V150/**'
      - 'hardware/Cambricon_MLU/**'
      - 'hardware/Huawei_Atlas800TA3/**'
      - 'hardware/Hygon_BW1000/**'
      - 'hardware/Kunlunxin_R310p/**'
      - 'hardware/MUSA_S5000/**'
      - 'hardware/Metax_C550/**'
      - 'hardware/Tsing_micro/**'

# Cancel a superseded in-flight run of the same workflow for the same PR/ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true
jobs:
  # Resolve the CI container image tag once and expose it to every
  # downstream job via the job output `ci_image`.
  set-env:
    runs-on: ubuntu-latest
    outputs:
      ci_image: ${{ steps.set-env.outputs.ci_image }}  # Declare output variable
    steps:
      - name: Set Environment Variable
        id: set-env  # Assign an ID to this step
        run: |
          echo "ci_image=localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.0-time2507111538" >> $GITHUB_OUTPUT # Set output variable
# Train Megatron Unit Tests with Matrix
unit_tests_train_megatron:
needs:
- set-env
uses: ./.github/workflows/unit-tests-nvidia.yml
strategy:
matrix:
subset:
- data
- dist_checkpointing
- distributed
- export
- fusions
- inference
- models
- pipeline_parallel
- post_training
- ssm
- tensor_parallel
- transformer/moe
- transformer
- ./
name: "train_megatron-${{ matrix.subset == './' && 'root' || matrix.subset }}"
with:
backend: train_megatron
subset: ${{ matrix.subset }}
image: ${{ needs.set-env.outputs.ci_image }}
# Train Flagscale Unit Tests with Matrix
unit_tests_train_flagscale:
needs:
- set-env
- unit_tests_train_megatron
uses: ./.github/workflows/unit-tests-nvidia.yml
strategy:
matrix:
subset:
- runner
- ./
name: "train_flagscale-${{ matrix.subset == './' && 'root' || matrix.subset }}"
with:
backend: train_flagscale
subset: ${{ matrix.subset }}
image: ${{ needs.set-env.outputs.ci_image }}
# Inference Flagscale Unit Tests with Matrix
unit_tests_inference_flagscale:
needs:
- set-env
- unit_tests_train_flagscale
uses: ./.github/workflows/unit-tests-nvidia.yml
strategy:
matrix:
subset:
- inference
- transforms
name: "inference_flagscale-${{ matrix.subset == './' && 'root' || matrix.subset }}"
with:
backend: inference_flagscale
subset: ${{ matrix.subset }}
image: ${{ needs.set-env.outputs.ci_image }}
# Elastic Unit Tests
unit_tests_elastic:
needs:
- set-env
- unit_tests_train_flagscale
runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
container:
image: ${{ needs.set-env.outputs.ci_image }}
ports:
- 80
volumes:
- /home/flagscale_cicd/flask/static:/workspace/report
- /home/flagscale_cicd/flask/config:/workspace/config
- /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data
- /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers
options: --gpus all --shm-size=500g --hostname flagscale_cicd --user root --ulimit nofile=65535:65535
steps:
- name: Checkout Code
uses: actions/checkout@v4
with:
repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
ref: ${{ github.event.pull_request.head.ref || github.ref }}
ssh-strict: true
ssh-user: git
persist-credentials: true
clean: true
sparse-checkout-cone-mode: true
fetch-tags: false
show-progress: true
lfs: false
submodules: false
set-safe-directory: true
- name: Setup Environment
run: |
echo "USER: $USER"
echo "UID: $(id -u)"
echo "GID: $(id -g)"
echo "Home: $HOME"
whoami
git config --global --add safe.directory /__w/FlagScale/FlagScale || true
- name: Install Dependencies
run: |
source /root/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
pip install pylint pytest pytest-cov
python tools/patch/unpatch.py --backend Megatron-LM || true
export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
ulimit -n 65535
- name: Pylint Check - Elastic Module
run: |
source /root/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH
echo "Running pylint on elastic module..."
pylint flagscale/runner/elastic/ --output-format=text --reports=yes --exit-zero > pylint_report.txt
# Display pylint results
cat pylint_report.txt
# Extract pylint score
PYLINT_SCORE=$(grep "Your code has been rated at" pylint_report.txt | sed 's/.*rated at \([0-9.]*\).*/\1/' || echo "0")
echo "Pylint Score: $PYLINT_SCORE"
# Fail if score is below 8.0
if (( $(echo "$PYLINT_SCORE < 8.0" | bc -l) )); then
echo "Pylint score $PYLINT_SCORE is below required threshold of 8.0"
exit 1
fi
- name: Run Elastic Unit Tests
run: |
source /root/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
ulimit -n 65535
echo "Running elastic module unit tests..."
pytest tests/unit_tests/runner/elastic/ \
--cov=flagscale/runner/elastic \
--cov-report=xml:coverage_elastic.xml \
--cov-report=html:coverage_elastic_html \
--cov-report=term \
-v \
--tb=short \
-x
- name: Coverage Report
run: |
source /root/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
echo "Coverage Summary for Elastic Module:"
if [ -f coverage_elastic.xml ]; then
echo "Coverage XML report generated successfully"
python -c "
import xml.etree.ElementTree as ET

Check failure on line 208 in .github/workflows/all-tests-nvidia.yml

View workflow run for this annotation

GitHub Actions / .github/workflows/all-tests-nvidia.yml

Invalid workflow file

You have an error in your yaml syntax on line 208
try:
tree = ET.parse('coverage_elastic.xml')
root = tree.getroot()
coverage = root.attrib.get('line-rate', '0')
percentage = float(coverage) * 100
print(f'Line Coverage: {percentage:.1f}%')
if percentage < 80:
print(f'Warning: Coverage {percentage:.1f}% is below recommended 80%')
exit(1)
except Exception as e:
print(f'Error parsing coverage: {e}')
exit(1)
"
else
echo "Coverage XML report not found"
exit 1
fi
- name: Archive Test Results
uses: actions/upload-artifact@v3
if: always()
with:
name: elastic-test-results
path: |
pylint_report.txt
coverage_elastic.xml
coverage_elastic_html/
retention-days: 30
- name: Summary
if: always()
run: |
echo "=== Elastic Module CI/CD Summary ==="
echo "[INFO] Pylint check completed"
echo "[INFO] Unit tests executed"
echo "[INFO] Coverage report generated"
echo "=== Test artifacts uploaded to GitHub Actions ==="
# Functional Tests with Mision and Type Matrix
functional_tests_train:
needs:
- set-env
- unit_tests_inference_flagscale
- unit_tests_elastic
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- aquila
- deepseek
- mixtral
- llava_onevision
name: "train-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: train
image: ${{ needs.set-env.outputs.ci_image }}
functional_tests_hetero:
needs:
- set-env
- functional_tests_train
# TODO: test need fix
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- aquila
name: "hetero_train-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: hetero_train
image: ${{ needs.set-env.outputs.ci_image }}
functional_tests_inference:
needs:
- set-env
- functional_tests_hetero
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- deepseek_r1_distill_qwen
- deepseek_r1_distill_qwen-flaggems
- qwen3
- qwen3-flaggems
name: "inference-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: inference
image: ${{ needs.set-env.outputs.ci_image }}
functional_tests_serve:
needs:
- set-env
- functional_tests_inference
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- qwen2_5
- base
name: "serve-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: serve
image: ${{ needs.set-env.outputs.ci_image }}
functional_tests_rl:
needs:
- set-env
- functional_tests_serve
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- qwen2_5
name: "rl-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: rl
image: ${{ needs.set-env.outputs.ci_image }}
# Check All Tests
all-tests:
needs:
- unit_tests_train_megatron
- unit_tests_train_flagscale
- unit_tests_inference_flagscale
- functional_tests_train
- functional_tests_hetero
- functional_tests_inference
- functional_tests_serve
- functional_tests_rl
runs-on: ubuntu-latest
steps:
- name: All Tests Completed
run: echo "All tests completed successfully!"