---
# .github/workflows/all-tests-nvidia.yml
# (Pasted from the GitHub web UI for PR "merge elastic test into nvidia #2";
#  page chrome removed and original workflow indentation restored.)
name: All Tests Nvidia

# Trigger on pushes/PRs to main, but skip runs that only touch non-NVIDIA
# hardware backends (each of those has its own dedicated workflow).
on:
  push:
    branches: ["main"]
    paths-ignore:
      - 'hardware/BI_V150/**'
      - 'hardware/Cambricon_MLU/**'
      - 'hardware/Huawei_Atlas800TA3/**'
      - 'hardware/Hygon_BW1000/**'
      - 'hardware/Kunlunxin_R310p/**'
      - 'hardware/MUSA_S5000/**'
      - 'hardware/Metax_C550/**'
      - 'hardware/Tsing_micro/**'
  pull_request:
    branches: ["main"]
    paths-ignore:
      - 'hardware/BI_V150/**'
      - 'hardware/Cambricon_MLU/**'
      - 'hardware/Huawei_Atlas800TA3/**'
      - 'hardware/Hygon_BW1000/**'
      - 'hardware/Kunlunxin_R310p/**'
      - 'hardware/MUSA_S5000/**'
      - 'hardware/Metax_C550/**'
      - 'hardware/Tsing_micro/**'

# Cancel a superseded in-flight run of the same workflow for the same PR/ref.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true
jobs:
  # Resolve the CI container image tag once and expose it to every
  # downstream job via the job output `ci_image`.
  set-env:
    runs-on: ubuntu-latest
    outputs:
      ci_image: ${{ steps.set-env.outputs.ci_image }}  # Declare output variable
    steps:
      - name: Set Environment Variable
        id: set-env  # Assign an ID to this step
        run: |
          echo "ci_image=localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.0-time2507111538" >> $GITHUB_OUTPUT # Set output variable
# Train Megatron Unit Tests with Matrix
unit_tests_train_megatron:
needs:
- set-env
uses: ./.github/workflows/unit-tests-nvidia.yml
strategy:
matrix:
subset:
- data
- dist_checkpointing
- distributed
- export
- fusions
- inference
- models
- pipeline_parallel
- post_training
- ssm
- tensor_parallel
- transformer/moe
- transformer
- ./
name: "train_megatron-${{ matrix.subset == './' && 'root' || matrix.subset }}"
with:
backend: train_megatron
subset: ${{ matrix.subset }}
image: ${{ needs.set-env.outputs.ci_image }}
# Train Flagscale Unit Tests with Matrix
unit_tests_train_flagscale:
needs:
- set-env
- unit_tests_train_megatron
uses: ./.github/workflows/unit-tests-nvidia.yml
strategy:
matrix:
subset:
- runner
- ./
name: "train_flagscale-${{ matrix.subset == './' && 'root' || matrix.subset }}"
with:
backend: train_flagscale
subset: ${{ matrix.subset }}
image: ${{ needs.set-env.outputs.ci_image }}
# Inference Flagscale Unit Tests with Matrix
unit_tests_inference_flagscale:
needs:
- set-env
- unit_tests_train_flagscale
uses: ./.github/workflows/unit-tests-nvidia.yml
strategy:
matrix:
subset:
- inference
- transforms
name: "inference_flagscale-${{ matrix.subset == './' && 'root' || matrix.subset }}"
with:
backend: inference_flagscale
subset: ${{ matrix.subset }}
image: ${{ needs.set-env.outputs.ci_image }}
# Elastic Unit Tests
unit_tests_elastic:
needs:
- set-env
- unit_tests_train_flagscale
runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
container:
image: ${{ needs.set-env.outputs.ci_image }}
ports:
- 80
volumes:
- /home/flagscale_cicd/flask/static:/workspace/report
- /home/flagscale_cicd/flask/config:/workspace/config
- /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data
- /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers
options: --gpus all --shm-size=500g --hostname flagscale_cicd --user root --ulimit nofile=65535:65535
steps:
- name: Checkout Code
uses: actions/checkout@v4
with:
repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
ref: ${{ github.event.pull_request.head.ref || github.ref }}
ssh-strict: true
ssh-user: git
persist-credentials: true
clean: true
sparse-checkout-cone-mode: true
fetch-tags: false
show-progress: true
lfs: false
submodules: false
set-safe-directory: true
- name: Setup Environment
run: |
echo "USER: $USER"
echo "UID: $(id -u)"
echo "GID: $(id -g)"
echo "Home: $HOME"
whoami
git config --global --add safe.directory /__w/FlagScale/FlagScale || true
- name: Install Dependencies
run: |
source /root/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
pip install pylint pytest pytest-cov
python tools/patch/unpatch.py --backend Megatron-LM || true
export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
ulimit -n 65535
- name: Pylint Check - Elastic Module
run: |
source /root/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH
echo "Running pylint on elastic module..."
pylint flagscale/runner/elastic/ --output-format=text --reports=yes --exit-zero > pylint_report.txt
# Display pylint results
cat pylint_report.txt
# Extract pylint score
PYLINT_SCORE=$(grep "Your code has been rated at" pylint_report.txt | sed 's/.*rated at \([0-9.]*\).*/\1/' || echo "0")
echo "Pylint Score: $PYLINT_SCORE"
# Fail if score is below 8.0
if (( $(echo "$PYLINT_SCORE < 8.0" | bc -l) )); then
echo "Pylint score $PYLINT_SCORE is below required threshold of 8.0"
exit 1
fi
- name: Run Elastic Unit Tests
run: |
source /root/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
ulimit -n 65535
echo "Running elastic module unit tests..."
pytest tests/unit_tests/runner/elastic/ \
--cov=flagscale/runner/elastic \
--cov-report=xml:coverage_elastic.xml \
--cov-report=html:coverage_elastic_html \
--cov-report=term \
-v \
--tb=short \
-x
- name: Coverage Report
run: |
source /root/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
echo "Coverage Summary for Elastic Module:"
if [ -f coverage_elastic.xml ]; then
echo "Coverage XML report generated successfully"
python -c "
import xml.etree.ElementTree as ET

Check failure on line 208 in .github/workflows/all-tests-nvidia.yml

View workflow run for this annotation

GitHub Actions / .github/workflows/all-tests-nvidia.yml

Invalid workflow file

You have an error in your yaml syntax on line 208
try:
tree = ET.parse('coverage_elastic.xml')
root = tree.getroot()
coverage = root.attrib.get('line-rate', '0')
percentage = float(coverage) * 100
print(f'Line Coverage: {percentage:.1f}%')
if percentage < 80:
print(f'Warning: Coverage {percentage:.1f}% is below recommended 80%')
exit(1)
except Exception as e:
print(f'Error parsing coverage: {e}')
exit(1)
"
else
echo "Coverage XML report not found"
exit 1
fi
- name: Archive Test Results
uses: actions/upload-artifact@v3
if: always()
with:
name: elastic-test-results
path: |
pylint_report.txt
coverage_elastic.xml
coverage_elastic_html/
retention-days: 30
- name: Summary
if: always()
run: |
echo "=== Elastic Module CI/CD Summary ==="
echo "[INFO] Pylint check completed"
echo "[INFO] Unit tests executed"
echo "[INFO] Coverage report generated"
echo "=== Test artifacts uploaded to GitHub Actions ==="
# Functional Tests with Mision and Type Matrix
functional_tests_train:
needs:
- set-env
- unit_tests_inference_flagscale
- unit_tests_elastic
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- aquila
- deepseek
- mixtral
- llava_onevision
name: "train-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: train
image: ${{ needs.set-env.outputs.ci_image }}
functional_tests_hetero:
needs:
- set-env
- functional_tests_train
# TODO: test need fix
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- aquila
name: "hetero_train-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: hetero_train
image: ${{ needs.set-env.outputs.ci_image }}
functional_tests_inference:
needs:
- set-env
- functional_tests_hetero
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- deepseek_r1_distill_qwen
- deepseek_r1_distill_qwen-flaggems
- qwen3
- qwen3-flaggems
name: "inference-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: inference
image: ${{ needs.set-env.outputs.ci_image }}
functional_tests_serve:
needs:
- set-env
- functional_tests_inference
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- qwen2_5
- base
name: "serve-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: serve
image: ${{ needs.set-env.outputs.ci_image }}
functional_tests_rl:
needs:
- set-env
- functional_tests_serve
uses: ./.github/workflows/functional-tests-nvidia.yml
strategy:
matrix:
task:
- qwen2_5
name: "rl-${{ matrix.task }}"
with:
task: ${{ matrix.task }}
type: rl
image: ${{ needs.set-env.outputs.ci_image }}
# Check All Tests
all-tests:
needs:
- unit_tests_train_megatron
- unit_tests_train_flagscale
- unit_tests_inference_flagscale
- functional_tests_train
- functional_tests_hetero
- functional_tests_inference
- functional_tests_serve
- functional_tests_rl
runs-on: ubuntu-latest
steps:
- name: All Tests Completed
run: echo "All tests completed successfully!"