# --- GitHub web-UI residue from the original capture, preserved as comments ---
# Skip to content
#
# Merge branch 'flagos-ai:main' into test-ye #6
#
# Merge branch 'flagos-ai:main' into test-ye
#
# Merge branch 'flagos-ai:main' into test-ye #6
#
# Workflow file for this run
name: QA L1 Tests

# Run on pushes to the working branch and on PRs targeting main.
on:
  push:
    branches:
      - test-ye
  pull_request:
    branches:
      - main

# One in-flight run per workflow/PR/actor; newer runs cancel older ones.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true
jobs:
  qa-l1-test:
    # Self-hosted GPU runner labeled for transformer_engine / FlagScale CI.
    runs-on: [self-hosted, TE_FL]
    defaults:
      run:
        shell: bash
    container:
      image: localhost:5000/flagscale-cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
      ports:
        # Quoted: an unquoted 80:80 is a YAML 1.1 sexagesimal integer (4880),
        # not the string "host:container" that Docker expects.
        - "80:80"
      options: >-
        --gpus all
        --shm-size=500g
        --privileged
        --ipc=host
        --ulimit memlock=-1
        --ulimit stack=67108864
        --ulimit nofile=65535:65535
        --user root
        --pull always
steps:
- name: Checkout Code
uses: actions/[email protected]
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.ref }}
ssh-strict: true
ssh-user: git
persist-credentials: true
clean: true
sparse-checkout-cone-mode: true
fetch-tags: false
show-progress: true
lfs: false
submodules: recursive
set-safe-directory: true
- name: Install dependencies and build transformer_engine
# timeout-minutes: 30
env:
NVTE_FRAMEWORK: pytorch
TE_WITH_NCCL: 1
run: |
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
# Install MPI
apt update
apt install -y libopenmpi-dev openmpi-bin openmpi-common
apt install -y libmpich-dev mpich
# Verify the MPI header file
mpicxx -show | awk '{for(i=1;i<=NF;i++) if($i ~ /-I/) print substr($i,3)}'
# Verify whether the MPI C++ environment is ready
# 1. Verify whether the MPI C++ compiler (mpicxx) exists
mpicxx --version
# 2. Verify if the MPI library file exists
ls /usr/lib/x86_64-linux-gnu/libmpi_cxx.so
# Install dependencies
pip install optree looseversion opt_einsum lightning_utilities
# Clone lightning-thunder
git clone --recurse-submodules https://github.com/Lightning-AI/lightning-thunder.git
echo "Install transformer_engine"
pip install --no-build-isolation -vvv . --no-deps
# Verify installation
python3 tests/pytorch/test_sanity_import.py
- name: GPU Usage Check / Verification
run: |
source .github/workflows/scripts/gpu_check.sh
wait_for_gpu
- name: L1 CPP Distributed
id: L1_cpp_distributed
# timeout-minutes: 10
env:
TE_PATH: .
run: |
TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
TE_CPP_LIB_PATH="${TE_LIB_PATH}/transformer_engine"
export CMAKE_PREFIX_PATH="${TE_CPP_LIB_PATH}:${CMAKE_PREFIX_PATH}"
export LD_LIBRARY_PATH="${TE_CPP_LIB_PATH}:${LD_LIBRARY_PATH}"
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
bash ./qa/L1_cpp_distributed/test.sh
- name: L1 Pytorch Thunder Integration
id: L1_pytorch_thunder_integration
env:
XML_LOG_DIR: "/logs/pytorch/thunder"
THUNDER_PATH: "lightning-thunder"
run: |
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
bash ./qa/L1_pytorch_thunder_integration/test.sh
- name: L1 Pytorch Distributed Unittest
id: L1_pytorch_distributed_unittest
continue-on-error: true
env:
XML_LOG_DIR: "/logs/pytorch/distributed"
TE_PATH: .
run: |
ignore_files=("tests/pytorch/distributed/test_sanity.py" "tests/pytorch/distributed/test_comm_gemm_overlap.py"
"tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py" "tests/pytorch/attention/test_attention_with_cp.py"
"tests/pytorch/debug/test_distributed.py" "tests/pytorch/distributed/test_numerics.py"
)
for file in "${ignore_files[@]}"; do
escaped_file=$(echo "$file" | sed 's/\//\\\//g')
sed -i "s/^.*\($escaped_file\).*$/#&/" ./qa/L1_pytorch_distributed_unittest/test.sh
done
cat ./qa/L1_pytorch_distributed_unittest/test.sh
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
bash ./qa/L1_pytorch_distributed_unittest/test.sh
- name: L1 Pytorch Onnx Unittest
id: L1_pytorch_onnx_unittest
env:
XML_LOG_DIR: "/logs/pytorch/onnx"
TE_PATH: .
run: |
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
# Test items that need to be skipped
SKIP_TESTS=(
"test_trt_integration[None]"
)
# Execution file path
TARGET_FILE="./qa/L1_pytorch_onnx_unittest/test.sh"
OLD_CMD="python3 -m pytest --tb=auto --junitxml=\$XML_LOG_DIR/test_onnx_export.xml \$TE_PATH/tests/pytorch/test_onnx_export.py"
SKIP_PARAM=""
if [ ${#SKIP_TESTS[@]} -gt 0 ]; then
SKIP_EXPR=""
for test in "${SKIP_TESTS[@]}"; do
if [ -z "$SKIP_EXPR" ]; then
SKIP_EXPR="not $test"
else
SKIP_EXPR+=" and not $test"
fi
done
SKIP_PARAM="-k \"${SKIP_EXPR}\""
fi
CMD="${OLD_CMD} ${SKIP_PARAM}"
sed -i "s|${OLD_CMD}|${CMD}|g" "${TARGET_FILE}"
echo ""
cat ./qa/L1_pytorch_onnx_unittest/test.sh
echo ""
bash ./qa/L1_pytorch_onnx_unittest/test.sh