---
# NOTE(review): the lines "Skip to content" / "Add workflows #1" in the pasted
# source were GitHub web-UI residue, not YAML; preserved here as a comment.
# Original page title: "Add workflows #1".
#
# QA L1 - Comprehensive Integration Tests
# Runs the L1 QA suites (C++ distributed, PyTorch thunder/distributed/ONNX)
# whenever the relevant test trees or this workflow file change.
name: QA L1 - Comprehensive Integration Tests

on:
  push:
    branches:
      - addWorkflows26014
    paths:
      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
      - 'qa/L1_cpp_distributed/**'
      - 'tests/cpp_distributed/**'
      - 'qa/L1_pytorch_thunder_integration/**'
      - 'qa/L1_pytorch_distributed_unittest/**'
      - 'tests/pytorch/distributed/**'
      - 'tests/pytorch/attention/**'
      - 'qa/L1_pytorch_onnx_unittest/**'
      - 'tests/pytorch/test_onnx_export.py'
  pull_request:
    branches:
      - main
    paths:
      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
      - 'qa/L1_cpp_distributed/**'
      - 'tests/cpp_distributed/**'
      - 'qa/L1_pytorch_thunder_integration/**'
      - 'qa/L1_pytorch_distributed_unittest/**'
      - 'tests/pytorch/distributed/**'
      - 'tests/pytorch/attention/**'
      - 'qa/L1_pytorch_onnx_unittest/**'
      - 'tests/pytorch/test_onnx_export.py'

# Cancel in-flight runs superseded by a newer push to the same PR/ref by the
# same actor, so only the latest commit's run survives.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true
jobs:
  run-qa-l1-comprehensive-tests:
    # Self-hosted 8-GPU Linux x64 runner; all steps execute inside the
    # container below.
    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
    defaults:
      run:
        shell: bash
    container:
      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
      ports:
        # Quoted: unquoted colon-separated digits can hit YAML 1.1's
        # base-60 integer parsing; a string is always safe for port maps.
        - "80:80"
      options: >-
        --gpus all
        --shm-size=500g
        --privileged
        --ipc=host
        --ulimit memlock=-1
        --ulimit stack=67108864
        --ulimit nofile=65535:65535
        --user root
        --pull always
    steps:
      - name: Checkout Code
        # NOTE(review): the action reference was mangled to an obfuscated
        # e-mail address ("actions/[email protected]") in the pasted source;
        # restored to the standard checkout action — confirm the intended
        # pinned version against the repository history.
        uses: actions/checkout@v4
        with:
          # For pull_request events, check out the PR head branch from the
          # (possibly forked) head repository; both expressions are empty on
          # push events, falling back to the action's defaults.
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}
          ssh-strict: true
          ssh-user: git
          persist-credentials: true
          clean: true
          sparse-checkout-cone-mode: true
          fetch-tags: false
          show-progress: true
          lfs: false
          submodules: recursive
          set-safe-directory: true

      - name: Install Dependencies & Build Transformer Engine
        # timeout-minutes: 40
        env:
          NVTE_FRAMEWORK: pytorch
          # Quoted: env values are strings; an unquoted 1 is a YAML integer.
          TE_WITH_NCCL: "1"
        run: |
          # Activate conda environment
          echo "=== Activating Conda Environment ==="
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          # Install MPI (OpenMPI and MPICH)
          apt update
          apt install -y libopenmpi-dev openmpi-bin openmpi-common
          apt install -y libmpich-dev mpich
          # Print the MPI include directories reported by the compiler wrapper
          mpicxx -show | awk '{for(i=1;i<=NF;i++) if($i ~ /-I/) print substr($i,3)}'
          # Verify whether the MPI C++ environment is ready
          # 1. Verify whether the MPI C++ compiler (mpicxx) exists
          mpicxx --version
          # 2. Verify if the MPI library file exists
          ls /usr/lib/x86_64-linux-gnu/libmpi_cxx.so
          # Install Python build/test dependencies
          pip install optree looseversion opt_einsum lightning_utilities
          # Clone lightning-thunder (used by the thunder integration suite)
          git clone --recurse-submodules https://github.com/Lightning-AI/lightning-thunder.git
          echo "Install transformer_engine"
          pip install --no-build-isolation -vvv . --no-deps
          # Verify installation
          python3 tests/pytorch/test_sanity_import.py

      - name: Verify GPU Availability & Health
        run: |
          # Execute GPU check
          echo "=== Checking GPU Status ==="
          source .github/workflows/scripts/gpu_check.sh
          wait_for_gpu

      # The steps below are intentionally disabled; kept for future re-enable.
      # - name: Run L1 PyTorch Thunder Integration Tests
      #   env:
      #     XML_LOG_DIR: "/logs/pytorch/thunder"
      #     THUNDER_PATH: "lightning-thunder"
      #     TE_PATH: .
      #   run: |
      #     # Activate conda environment
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     # Run thunder integration tests
      #     echo "=== Running L1 PyTorch Thunder Integration Tests ==="
      #     bash ./qa/L1_pytorch_thunder_integration/test.sh
      #   # timeout-minutes: 5
      # - name: Run L1 PyTorch Distributed Unit Tests
      #   continue-on-error: true
      #   env:
      #     XML_LOG_DIR: "/logs/pytorch/distributed"
      #     TE_PATH: .
      #   run: |
      #     # Activate conda environment
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     # Run distributed unit tests
      #     echo "=== Running L1 PyTorch Distributed Unit Tests ==="
      #     bash ./qa/L1_pytorch_distributed_unittest/test.sh
      #   # timeout-minutes: 5
      # - name: Run L1 PyTorch ONNX Unit Tests
      #   env:
      #     XML_LOG_DIR: "/logs/pytorch/onnx"
      #     TE_PATH: .
      #   run: |
      #     # Activate conda environment
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     # Run ONNX unit tests
      #     echo "=== Running L1 PyTorch ONNX Unit Tests ==="
      #     bash ./qa/L1_pytorch_onnx_unittest/test.sh
      #   # timeout-minutes: 30