# NOTE(review): the lines below are GitHub web-UI residue captured when this
# workflow was copied from the browser (page chrome plus the PR title
# "Update qa_l1_test.yml #9"). Preserved as comments so the file stays
# parseable; safe to delete once confirmed.
#
# Skip to content
#
# Update qa_l1_test.yml #9
#
# Update qa_l1_test.yml
#
# Update qa_l1_test.yml #9
#
# Workflow file for this run
# GitHub Actions workflow: L1 QA test suite for transformer_engine on a
# self-hosted GPU runner. (Indentation restored — the pasted source was
# flush-left and therefore invalid YAML.)
name: QA L1 Tests

# Trigger on pushes to the working branch and on PRs targeting main.
on:
  push:
    branches:
      - test-ye
  pull_request:
    branches:
      - main

# One run at a time per workflow + PR (or ref) + actor; a newer run cancels
# any in-flight run of the same group.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true
jobs:
  qa-l1-test:
    # Self-hosted GPU runner labelled for transformer-engine jobs.
    runs-on: [ self-hosted, TE_FL ]
    defaults:
      run:
        # Explicit `shell: bash` makes every `run:` step fail fast
        # (bash -eo pipefail).
        shell: bash
    container:
      image: localhost:5000/flagscale-cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
      ports:
        # Quoted to avoid the YAML 1.1 sexagesimal trap (80:80 -> int 4880).
        - "80:80"
      options: >-
        --gpus all
        --shm-size=500g
        --privileged
        --ipc=host
        --ulimit memlock=-1
        --ulimit stack=67108864
        --ulimit nofile=65535:65535
        --user root
        --pull always
    steps:
      - name: Checkout Code
        # NOTE(review): the original pin was garbled by email obfuscation
        # ("actions/[email protected]"); re-pin to the exact version or commit SHA
        # your org requires.
        uses: actions/checkout@v4
        with:
          # On `push` events the pull_request context is empty, so fall back
          # to the repository/ref that triggered the run.
          repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
          ref: ${{ github.event.pull_request.head.ref || github.ref }}
          ssh-strict: true
          ssh-user: git
          persist-credentials: true
          clean: true
          sparse-checkout-cone-mode: true
          fetch-tags: false
          show-progress: true
          lfs: false
          submodules: recursive
          set-safe-directory: true

      - name: Install dependencies and build transformer_engine
        # timeout-minutes: 30
        env:
          NVTE_FRAMEWORK: pytorch
          TE_WITH_NCCL: 1
        run: |
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          # Install MPI
          apt update
          apt install -y libopenmpi-dev openmpi-bin openmpi-common
          apt install -y libmpich-dev mpich
          # Verify the MPI header file
          mpicxx -show | awk '{for(i=1;i<=NF;i++) if($i ~ /-I/) print substr($i,3)}'
          # Verify whether the MPI C++ environment is ready
          # 1. Verify whether the MPI C++ compiler (mpicxx) exists
          mpicxx --version
          # 2. Verify if the MPI library file exists
          ls /usr/lib/x86_64-linux-gnu/libmpi_cxx.so
          # Install dependencies
          pip install optree looseversion opt_einsum lightning_utilities
          # Clone lightning-thunder
          git clone --recurse-submodules https://github.com/Lightning-AI/lightning-thunder.git
          echo "Install transformer_engine"
          pip install --no-build-isolation -vvv . --no-deps
          # Verify installation
          python3 tests/pytorch/test_sanity_import.py

      - name: GPU Usage Check / Verification
        run: |
          source .github/workflows/scripts/gpu_check.sh
          wait_for_gpu

      # - name: L1 CPP Distributed  # to be verified
      #   id: L1_cpp_distributed
      #   # timeout-minutes: 10
      #   env:
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
      #     TE_CPP_LIB_PATH="${TE_LIB_PATH}/transformer_engine"
      #     export CMAKE_PREFIX_PATH="${TE_CPP_LIB_PATH}:${CMAKE_PREFIX_PATH}"
      #     export LD_LIBRARY_PATH="${TE_CPP_LIB_PATH}:${LD_LIBRARY_PATH}"
      #     bash ./qa/L1_cpp_distributed/test.sh

      - name: L1 Pytorch Thunder Integration
        id: L1_pytorch_thunder_integration
        env:
          XML_LOG_DIR: "/logs/pytorch/thunder"
          # Path of the lightning-thunder clone made in the install step above.
          THUNDER_PATH: "lightning-thunder"
        run: |
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          bash ./qa/L1_pytorch_thunder_integration/test.sh

      - name: L1 Pytorch Distributed Unittest
        id: L1_pytorch_distributed_unittest
        # Do not fail the whole job if the distributed tests fail.
        continue-on-error: true
        env:
          XML_LOG_DIR: "/logs/pytorch/distributed"
          TE_PATH: .
        run: |
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          bash ./qa/L1_pytorch_distributed_unittest/test.sh

      - name: L1 Pytorch Onnx Unittest
        id: L1_pytorch_onnx_unittest
        env:
          XML_LOG_DIR: "/logs/pytorch/onnx"
          TE_PATH: .
        run: |
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          bash ./qa/L1_pytorch_onnx_unittest/test.sh