# merge elastic test into nvidia (#2)
name: All Tests Nvidia

# Trigger on pushes and pull requests targeting main, but skip runs whose
# changes touch only non-Nvidia hardware trees (each has its own workflow).
on:
  push:
    branches: ["main"]
    paths-ignore:
      - 'hardware/BI_V150/**'
      - 'hardware/Cambricon_MLU/**'
      - 'hardware/Huawei_Atlas800TA3/**'
      - 'hardware/Hygon_BW1000/**'
      - 'hardware/Kunlunxin_R310p/**'
      - 'hardware/MUSA_S5000/**'
      - 'hardware/Metax_C550/**'
      - 'hardware/Tsing_micro/**'
  pull_request:
    branches: ["main"]
    paths-ignore:
      - 'hardware/BI_V150/**'
      - 'hardware/Cambricon_MLU/**'
      - 'hardware/Huawei_Atlas800TA3/**'
      - 'hardware/Hygon_BW1000/**'
      - 'hardware/Kunlunxin_R310p/**'
      - 'hardware/MUSA_S5000/**'
      - 'hardware/Metax_C550/**'
      - 'hardware/Tsing_micro/**'

# Cancel an in-flight run when a newer one starts for the same PR/ref/actor.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true
jobs:
  # Resolve shared configuration once; every downstream job consumes the
  # CI container image tag through this job's outputs.
  set-env:
    runs-on: ubuntu-latest
    outputs:
      # CI image used by all unit/functional test jobs below.
      ci_image: ${{ steps.set-env.outputs.ci_image }}
    steps:
      - name: Set Environment Variable
        # The step id is referenced by the job-level `outputs` mapping above.
        id: set-env
        run: |
          echo "ci_image=localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.0-time2507111538" >> $GITHUB_OUTPUT
| # Train Megatron Unit Tests with Matrix | ||
| unit_tests_train_megatron: | ||
| needs: | ||
| - set-env | ||
| uses: ./.github/workflows/unit-tests-nvidia.yml | ||
| strategy: | ||
| matrix: | ||
| subset: | ||
| - data | ||
| - dist_checkpointing | ||
| - distributed | ||
| - export | ||
| - fusions | ||
| - inference | ||
| - models | ||
| - pipeline_parallel | ||
| - post_training | ||
| - ssm | ||
| - tensor_parallel | ||
| - transformer/moe | ||
| - transformer | ||
| - ./ | ||
| name: "train_megatron-${{ matrix.subset == './' && 'root' || matrix.subset }}" | ||
| with: | ||
| backend: train_megatron | ||
| subset: ${{ matrix.subset }} | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| # Train Flagscale Unit Tests with Matrix | ||
| unit_tests_train_flagscale: | ||
| needs: | ||
| - set-env | ||
| - unit_tests_train_megatron | ||
| uses: ./.github/workflows/unit-tests-nvidia.yml | ||
| strategy: | ||
| matrix: | ||
| subset: | ||
| - runner | ||
| - ./ | ||
| name: "train_flagscale-${{ matrix.subset == './' && 'root' || matrix.subset }}" | ||
| with: | ||
| backend: train_flagscale | ||
| subset: ${{ matrix.subset }} | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| # Inference Flagscale Unit Tests with Matrix | ||
| unit_tests_inference_flagscale: | ||
| needs: | ||
| - set-env | ||
| - unit_tests_train_flagscale | ||
| uses: ./.github/workflows/unit-tests-nvidia.yml | ||
| strategy: | ||
| matrix: | ||
| subset: | ||
| - inference | ||
| - transforms | ||
| name: "inference_flagscale-${{ matrix.subset == './' && 'root' || matrix.subset }}" | ||
| with: | ||
| backend: inference_flagscale | ||
| subset: ${{ matrix.subset }} | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| # Elastic Unit Tests | ||
| unit_tests_elastic: | ||
| needs: | ||
| - set-env | ||
| - unit_tests_train_flagscale | ||
| runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] | ||
| container: | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| ports: | ||
| - 80 | ||
| volumes: | ||
| - /home/flagscale_cicd/flask/static:/workspace/report | ||
| - /home/flagscale_cicd/flask/config:/workspace/config | ||
| - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data | ||
| - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers | ||
| options: --gpus all --shm-size=500g --hostname flagscale_cicd --user root --ulimit nofile=65535:65535 | ||
| steps: | ||
| - name: Checkout Code | ||
| uses: actions/checkout@v4 | ||
| with: | ||
| repository: ${{ github.event.pull_request.head.repo.full_name || github.repository }} | ||
| ref: ${{ github.event.pull_request.head.ref || github.ref }} | ||
| ssh-strict: true | ||
| ssh-user: git | ||
| persist-credentials: true | ||
| clean: true | ||
| sparse-checkout-cone-mode: true | ||
| fetch-tags: false | ||
| show-progress: true | ||
| lfs: false | ||
| submodules: false | ||
| set-safe-directory: true | ||
| - name: Setup Environment | ||
| run: | | ||
| echo "USER: $USER" | ||
| echo "UID: $(id -u)" | ||
| echo "GID: $(id -g)" | ||
| echo "Home: $HOME" | ||
| whoami | ||
| git config --global --add safe.directory /__w/FlagScale/FlagScale || true | ||
| - name: Install Dependencies | ||
| run: | | ||
| source /root/miniconda3/etc/profile.d/conda.sh | ||
| conda activate flagscale-train | ||
| pip install pylint pytest pytest-cov | ||
| python tools/patch/unpatch.py --backend Megatron-LM || true | ||
| export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH | ||
| export NVTE_FLASH_ATTN=0 | ||
| export NVTE_FUSED_ATTN=0 | ||
| ulimit -n 65535 | ||
| - name: Pylint Check - Elastic Module | ||
| run: | | ||
| source /root/miniconda3/etc/profile.d/conda.sh | ||
| conda activate flagscale-train | ||
| export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH | ||
| echo "Running pylint on elastic module..." | ||
| pylint flagscale/runner/elastic/ --output-format=text --reports=yes --exit-zero > pylint_report.txt | ||
| # Display pylint results | ||
| cat pylint_report.txt | ||
| # Extract pylint score | ||
| PYLINT_SCORE=$(grep "Your code has been rated at" pylint_report.txt | sed 's/.*rated at \([0-9.]*\).*/\1/' || echo "0") | ||
| echo "Pylint Score: $PYLINT_SCORE" | ||
| # Fail if score is below 8.0 | ||
| if (( $(echo "$PYLINT_SCORE < 8.0" | bc -l) )); then | ||
| echo "Pylint score $PYLINT_SCORE is below required threshold of 8.0" | ||
| exit 1 | ||
| fi | ||
| - name: Run Elastic Unit Tests | ||
| run: | | ||
| source /root/miniconda3/etc/profile.d/conda.sh | ||
| conda activate flagscale-train | ||
| export PYTHONPATH=./third_party/Megatron-LM:$PYTHONPATH | ||
| export NVTE_FLASH_ATTN=0 | ||
| export NVTE_FUSED_ATTN=0 | ||
| ulimit -n 65535 | ||
| echo "Running elastic module unit tests..." | ||
| pytest tests/unit_tests/runner/elastic/ \ | ||
| --cov=flagscale/runner/elastic \ | ||
| --cov-report=xml:coverage_elastic.xml \ | ||
| --cov-report=html:coverage_elastic_html \ | ||
| --cov-report=term \ | ||
| -v \ | ||
| --tb=short \ | ||
| -x | ||
| - name: Coverage Report | ||
| run: | | ||
| source /root/miniconda3/etc/profile.d/conda.sh | ||
| conda activate flagscale-train | ||
| echo "Coverage Summary for Elastic Module:" | ||
| if [ -f coverage_elastic.xml ]; then | ||
| echo "Coverage XML report generated successfully" | ||
| python -c " | ||
| import xml.etree.ElementTree as ET | ||
| try: | ||
| tree = ET.parse('coverage_elastic.xml') | ||
| root = tree.getroot() | ||
| coverage = root.attrib.get('line-rate', '0') | ||
| percentage = float(coverage) * 100 | ||
| print(f'Line Coverage: {percentage:.1f}%') | ||
| if percentage < 80: | ||
| print(f'Warning: Coverage {percentage:.1f}% is below recommended 80%') | ||
| exit(1) | ||
| except Exception as e: | ||
| print(f'Error parsing coverage: {e}') | ||
| exit(1) | ||
| " | ||
| else | ||
| echo "Coverage XML report not found" | ||
| exit 1 | ||
| fi | ||
| - name: Archive Test Results | ||
| uses: actions/upload-artifact@v3 | ||
| if: always() | ||
| with: | ||
| name: elastic-test-results | ||
| path: | | ||
| pylint_report.txt | ||
| coverage_elastic.xml | ||
| coverage_elastic_html/ | ||
| retention-days: 30 | ||
| - name: Summary | ||
| if: always() | ||
| run: | | ||
| echo "=== Elastic Module CI/CD Summary ===" | ||
| echo "[INFO] Pylint check completed" | ||
| echo "[INFO] Unit tests executed" | ||
| echo "[INFO] Coverage report generated" | ||
| echo "=== Test artifacts uploaded to GitHub Actions ===" | ||
| # Functional Tests with Mision and Type Matrix | ||
| functional_tests_train: | ||
| needs: | ||
| - set-env | ||
| - unit_tests_inference_flagscale | ||
| - unit_tests_elastic | ||
| uses: ./.github/workflows/functional-tests-nvidia.yml | ||
| strategy: | ||
| matrix: | ||
| task: | ||
| - aquila | ||
| - deepseek | ||
| - mixtral | ||
| - llava_onevision | ||
| name: "train-${{ matrix.task }}" | ||
| with: | ||
| task: ${{ matrix.task }} | ||
| type: train | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| functional_tests_hetero: | ||
| needs: | ||
| - set-env | ||
| - functional_tests_train | ||
| # TODO: test need fix | ||
| uses: ./.github/workflows/functional-tests-nvidia.yml | ||
| strategy: | ||
| matrix: | ||
| task: | ||
| - aquila | ||
| name: "hetero_train-${{ matrix.task }}" | ||
| with: | ||
| task: ${{ matrix.task }} | ||
| type: hetero_train | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| functional_tests_inference: | ||
| needs: | ||
| - set-env | ||
| - functional_tests_hetero | ||
| uses: ./.github/workflows/functional-tests-nvidia.yml | ||
| strategy: | ||
| matrix: | ||
| task: | ||
| - deepseek_r1_distill_qwen | ||
| - deepseek_r1_distill_qwen-flaggems | ||
| - qwen3 | ||
| - qwen3-flaggems | ||
| name: "inference-${{ matrix.task }}" | ||
| with: | ||
| task: ${{ matrix.task }} | ||
| type: inference | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| functional_tests_serve: | ||
| needs: | ||
| - set-env | ||
| - functional_tests_inference | ||
| uses: ./.github/workflows/functional-tests-nvidia.yml | ||
| strategy: | ||
| matrix: | ||
| task: | ||
| - qwen2_5 | ||
| - base | ||
| name: "serve-${{ matrix.task }}" | ||
| with: | ||
| task: ${{ matrix.task }} | ||
| type: serve | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| functional_tests_rl: | ||
| needs: | ||
| - set-env | ||
| - functional_tests_serve | ||
| uses: ./.github/workflows/functional-tests-nvidia.yml | ||
| strategy: | ||
| matrix: | ||
| task: | ||
| - qwen2_5 | ||
| name: "rl-${{ matrix.task }}" | ||
| with: | ||
| task: ${{ matrix.task }} | ||
| type: rl | ||
| image: ${{ needs.set-env.outputs.ci_image }} | ||
| # Check All Tests | ||
| all-tests: | ||
| needs: | ||
| - unit_tests_train_megatron | ||
| - unit_tests_train_flagscale | ||
| - unit_tests_inference_flagscale | ||
| - functional_tests_train | ||
| - functional_tests_hetero | ||
| - functional_tests_inference | ||
| - functional_tests_serve | ||
| - functional_tests_rl | ||
| runs-on: ubuntu-latest | ||
| steps: | ||
| - name: All Tests Completed | ||
| run: echo "All tests completed successfully!" | ||