From 2314a2165247eacbc8e4bf023d7afd71bd0b25d5 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 10 Mar 2026 17:27:07 +0800 Subject: [PATCH 01/29] [CICD] Add auto build and push CUDA Docker images to Harbor pipeline --- .github/workflows/all_tests_cuda.yml | 20 ++ .github/workflows/build_image_cuda.yml | 294 ++++++++++++++++++++++++ .github/workflows/push_image_harbor.yml | 126 ++++++++++ requirements/cuda/serve.txt | 2 +- 4 files changed, 441 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/build_image_cuda.yml create mode 100644 .github/workflows/push_image_harbor.yml diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml index 0b34872207..6b05048c1a 100644 --- a/.github/workflows/all_tests_cuda.yml +++ b/.github/workflows/all_tests_cuda.yml @@ -1,6 +1,11 @@ name: cuda_tests on: + # Trigger after Build Docker Images - CUDA succeeds + workflow_run: + workflows: ["Build Docker Images - CUDA"] + types: [completed] + push: branches: ["main"] pull_request: @@ -12,6 +17,10 @@ concurrency: jobs: run_tests: + # Skip if triggered by workflow_run but the build did not succeed + if: >- + github.event_name != 'workflow_run' || + github.event.workflow_run.conclusion == 'success' # Package manager and environment settings are read from .github/configs/cuda.yml uses: ./.github/workflows/all_tests_common.yml with: @@ -29,3 +38,14 @@ jobs: exit 1 fi echo "✅ All tests passed!" 
+ + push_images_to_harbor: + needs: all_tests + if: >- + needs.all_tests.result == 'success' && + github.event_name == 'workflow_run' && + github.event.workflow_run.conclusion == 'success' + uses: ./.github/workflows/push_image_harbor.yml + with: + platform: cuda + secrets: inherit \ No newline at end of file diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml new file mode 100644 index 0000000000..12d531cffa --- /dev/null +++ b/.github/workflows/build_image_cuda.yml @@ -0,0 +1,294 @@ +name: Build Docker Images - CUDA + +on: + # Manual trigger with configurable options + workflow_dispatch: + inputs: + task: + description: 'Task to build' + required: true + type: choice + options: + - train + - inference + - all + default: 'train' + target: + description: 'Build target stage (dev includes dev tools, release is production)' + required: true + type: choice + options: + - dev + - release + default: 'dev' + push: + description: 'Push image to registry' + type: boolean + default: true + no_cache: + description: 'Build without Docker cache' + type: boolean + default: false + pkg_mgr: + description: 'Package manager to use' + required: true + type: choice + options: + - conda + - uv + default: 'conda' + build_all_tasks: + description: 'Build all tasks (train, inference, all) - overrides task selection' + type: boolean + default: false + + # Trigger on PRs that modify docker-related files + pull_request: + branches: [main] + paths: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + # Local registry used by CI runners (see .github/configs/cuda.yml) + REGISTRY: localhost:5000 + # Default build versions (keep in sync with docker/build.sh) + CUDA_VERSION: '12.8.1' + UBUNTU_VERSION: '22.04' + PYTHON_VERSION: '3.12' + 
UV_VERSION: '0.7.2' + PKG_MGR: ${{ inputs.pkg_mgr || 'conda' }} + +jobs: + # --------------------------------------------------------------------------- + # Prepare: compute build matrix and parameters based on trigger type + # --------------------------------------------------------------------------- + prepare: + name: Prepare build matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + target: ${{ steps.params.outputs.target }} + push: ${{ steps.params.outputs.push }} + no_cache: ${{ steps.params.outputs.no_cache }} + steps: + - name: Determine build matrix + id: set-matrix + run: | + EVENT="${{ github.event_name }}" + + if [ "$EVENT" = "workflow_dispatch" ] && [ "${{ inputs.build_all_tasks }}" != "true" ]; then + # Manual trigger: build selected task only + echo 'matrix={"task":["${{ inputs.task }}"]}' >> $GITHUB_OUTPUT + else + # Push to main or build_all_tasks=true: build all tasks + echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT + fi + + - name: Set build parameters + id: params + run: | + EVENT="${{ github.event_name }}" + + if [ "$EVENT" = "push" ]; then + # Push to main: always build dev images and push + echo "target=dev" >> $GITHUB_OUTPUT + echo "push=true" >> $GITHUB_OUTPUT + echo "no_cache=false" >> $GITHUB_OUTPUT + else + echo "target=${{ inputs.target || 'dev' }}" >> $GITHUB_OUTPUT + echo "push=${{ inputs.push }}" >> $GITHUB_OUTPUT + echo "no_cache=${{ inputs.no_cache }}" >> $GITHUB_OUTPUT + fi + + # --------------------------------------------------------------------------- + # Build: build and push Docker images (matrix across tasks) + # --------------------------------------------------------------------------- + build: + name: Build ${{ matrix.task }} + needs: prepare + runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.prepare.outputs.matrix) }} + outputs: + train_tag: ${{ steps.export.outputs.train_tag }} + inference_tag: 
${{ steps.export.outputs.inference_tag }} + all_tag: ${{ steps.export.outputs.all_tag }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + # Use docker driver to avoid pulling moby/buildkit from Docker Hub + driver: docker + + - name: Compute build metadata + id: meta + run: | + set -euo pipefail + + TASK="${{ matrix.task }}" + TARGET="${{ needs.prepare.outputs.target }}" + CUDA_VERSION="${{ env.CUDA_VERSION }}" + PYTHON_VERSION="${{ env.PYTHON_VERSION }}" + UBUNTU_VERSION="${{ env.UBUNTU_VERSION }}" + + CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1) + CUDA_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f2) + TIMESTAMP=$(date +%Y%m%d%H%M%S) + + # Image naming follows docker/build.sh convention: + # flagscale-:-cu-py- + IMAGE_NAME="flagscale-${TASK}" + TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}-${TIMESTAMP}" + + # Local registry tag (for CI runners) + LOCAL_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG}" + + # Derived build arguments + BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" + PYTORCH_INDEX="https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}" + + # Tags list: local registry only + TAGS="${LOCAL_TAG}" + + echo "image_name=${IMAGE_NAME}" >> $GITHUB_OUTPUT + echo "tag=${TAG}" >> $GITHUB_OUTPUT + echo "local_tag=${LOCAL_TAG}" >> $GITHUB_OUTPUT + echo "base_image=${BASE_IMAGE}" >> $GITHUB_OUTPUT + echo "pytorch_index=${PYTORCH_INDEX}" >> $GITHUB_OUTPUT + + # Multi-line tags output + { + echo "tags<> $GITHUB_OUTPUT + + # Job summary + { + echo "### Build: ${IMAGE_NAME}" + echo "" + echo "| Parameter | Value |" + echo "|---|---|" + echo "| Task | \`${TASK}\` |" + echo "| Target | \`${TARGET}\` |" + echo "| CUDA | \`${CUDA_VERSION}\` |" + echo "| Python | \`${PYTHON_VERSION}\` |" + echo "| Dockerfile | \`docker/cuda/Dockerfile.${TASK}\` |" + echo "| Local Tag | \`${LOCAL_TAG}\` |" + } >> $GITHUB_STEP_SUMMARY + + echo "PKG_MGR: 
${{ env.PKG_MGR }}" + + - name: Build Docker image + uses: docker/build-push-action@v6 + with: + context: . + file: docker/cuda/Dockerfile.${{ matrix.task }} + target: ${{ needs.prepare.outputs.target }} + load: true + tags: ${{ steps.meta.outputs.tags }} + build-args: | + BASE_IMAGE=${{ steps.meta.outputs.base_image }} + CUDA_VERSION=${{ env.CUDA_VERSION }} + UBUNTU_VERSION=${{ env.UBUNTU_VERSION }} + PYTHON_VERSION=${{ env.PYTHON_VERSION }} + UV_VERSION=${{ env.UV_VERSION }} + PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }} + PKG_MGR=${{ env.PKG_MGR }} + no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }} + + - name: Push Docker image + if: needs.prepare.outputs.push == 'true' + run: docker push ${{ steps.meta.outputs.local_tag }} + + - name: Export image tag for config update + id: export + if: success() && needs.prepare.outputs.push == 'true' + run: | + TASK="${{ matrix.task }}" + echo "${TASK}_tag=${{ steps.meta.outputs.local_tag }}" >> $GITHUB_OUTPUT + + - name: Print build result + if: success() + run: | + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Result:** Built successfully" >> $GITHUB_STEP_SUMMARY + echo "**Pushed:** ${{ needs.prepare.outputs.push }}" >> $GITHUB_STEP_SUMMARY + + # --------------------------------------------------------------------------- + # Update Config: update cuda.yml with localhost tags (temporary, for test validation) + # After tests pass, push_image_harbor.yml will promote to Harbor and update to final tags + # --------------------------------------------------------------------------- + update-config: + name: Update CI config + needs: build + runs-on: [self-hosted, Linux, X64] + if: needs.prepare.outputs.push == 'true' && needs.build.result == 'success' + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.ref }} + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Update cuda.yml with new image tags + run: | + set -euo pipefail + CONFIG_FILE=".github/configs/cuda.yml" + + TRAIN_TAG="${{ 
needs.build.outputs.train_tag }}" + INFERENCE_TAG="${{ needs.build.outputs.inference_tag }}" + + if [ -n "$TRAIN_TAG" ]; then + echo "Updating ci_train_image to: $TRAIN_TAG" + sed -i "s|^ci_train_image:.*|ci_train_image: ${TRAIN_TAG}|" "$CONFIG_FILE" + fi + + if [ -n "$INFERENCE_TAG" ]; then + echo "Updating ci_inference_image to: $INFERENCE_TAG" + sed -i "s|^ci_inference_image:.*|ci_inference_image: ${INFERENCE_TAG}|" "$CONFIG_FILE" + fi + + echo "Updated config:" + cat "$CONFIG_FILE" + + - name: Commit and push config update + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add .github/configs/cuda.yml + if git diff --cached --quiet; then + echo "No config changes to commit" + else + git commit -m "ci: update CUDA image tags [skip ci]" + git push + fi + + # --------------------------------------------------------------------------- + # Summary: verify all builds completed + # --------------------------------------------------------------------------- + summary: + name: Build summary + needs: update-config + runs-on: ubuntu-latest + if: always() + steps: + - name: Verify build results + run: | + if [ "${{ needs.update-config.result }}" != "success" ]; then + echo "::error::One or more image builds failed" + exit 1 + fi + echo "All Docker images built successfully!" diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml new file mode 100644 index 0000000000..2a89aff39d --- /dev/null +++ b/.github/workflows/push_image_harbor.yml @@ -0,0 +1,126 @@ +name: Push Images to Harbor + +on: + workflow_call: + inputs: + platform: + required: true + type: string + description: "Platform name (e.g. 
cuda), used to locate .github/configs/.yml" + +env: + REMOTE_REGISTRY: harbor.baai.ac.cn + REMOTE_IMAGE_PREFIX: flagscale + +jobs: + promote: + name: Push validated images to Harbor + runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Read image tags from config + id: tags + run: | + set -euo pipefail + CONFIG_FILE=".github/configs/${{ inputs.platform }}.yml" + + TRAIN_TAG=$(grep '^ci_train_image:' "$CONFIG_FILE" | awk '{print $2}') + INFERENCE_TAG=$(grep '^ci_inference_image:' "$CONFIG_FILE" | awk '{print $2}') + + echo "train_tag=${TRAIN_TAG}" >> $GITHUB_OUTPUT + echo "inference_tag=${INFERENCE_TAG}" >> $GITHUB_OUTPUT + + # Check if images are from localhost (freshly built, need promotion) + if echo "${TRAIN_TAG}${INFERENCE_TAG}" | grep -q 'localhost'; then + echo "needs_promotion=true" >> $GITHUB_OUTPUT + echo "Images are from localhost, promotion needed" + else + echo "needs_promotion=false" >> $GITHUB_OUTPUT + echo "Images already on Harbor, skipping promotion" + fi + + - name: Login to Harbor registry + if: steps.tags.outputs.needs_promotion == 'true' + uses: docker/login-action@v3 + with: + registry: ${{ env.REMOTE_REGISTRY }} + username: ${{ secrets.REGISTRY_USERNAME }} + password: ${{ secrets.CONTAINER_REGISTRY }} + + - name: Promote train image to Harbor + id: promote_train + if: steps.tags.outputs.needs_promotion == 'true' && steps.tags.outputs.train_tag != '' + run: | + set -euo pipefail + LOCAL_TAG="${{ steps.tags.outputs.train_tag }}" + IMAGE_AND_TAG="${LOCAL_TAG#*/}" + REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}" + + echo "Promoting: ${LOCAL_TAG} → ${REMOTE_TAG}" + docker pull "${LOCAL_TAG}" + docker tag "${LOCAL_TAG}" "${REMOTE_TAG}" + docker push "${REMOTE_TAG}" + + echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT + echo "Successfully pushed ${REMOTE_TAG}" + + - name: Promote inference image to Harbor + id: promote_inference + if: 
steps.tags.outputs.needs_promotion == 'true' && steps.tags.outputs.inference_tag != '' + run: | + set -euo pipefail + LOCAL_TAG="${{ steps.tags.outputs.inference_tag }}" + IMAGE_AND_TAG="${LOCAL_TAG#*/}" + REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}" + + echo "Promoting: ${LOCAL_TAG} → ${REMOTE_TAG}" + docker pull "${LOCAL_TAG}" + docker tag "${LOCAL_TAG}" "${REMOTE_TAG}" + docker push "${REMOTE_TAG}" + + echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT + echo "Successfully pushed ${REMOTE_TAG}" + + - name: Commit and push config update + if: steps.tags.outputs.needs_promotion == 'true' + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add .github/configs/${{ inputs.platform }}.yml + if git diff --cached --quiet; then + echo "No config changes to commit" + else + git commit -m "ci: promote ${{ inputs.platform }} image tags to Harbor [skip ci]" + git push + fi + + # --------------------------------------------------------------------------- + # Cleanup: clean up Docker build cache and dangling images on self-hosted runner + # --------------------------------------------------------------------------- + cleanup: + name: Clean up build cache + needs: promote + runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] + if: always() + steps: + - name: Remove dangling images + run: docker image prune -f 2>/dev/null || true + + - name: Remove build cache older than 7 days + run: docker builder prune -f --filter "until=168h" 2>/dev/null || true + + - name: Remove old localhost registry images + run: | + # Remove local images tagged with localhost:5000 that are older than 7 days + docker images --format '{{.Repository}}:{{.Tag}} {{.CreatedSince}}' \ + | grep 'localhost:5000' \ + | grep -E '(weeks|months)' \ + | awk '{print $1}' \ + | xargs -r docker rmi 2>/dev/null || true + + - name: Report disk usage + run: | + echo "Docker disk usage:" + docker system 
df diff --git a/requirements/cuda/serve.txt b/requirements/cuda/serve.txt index 02dd3ca0e4..81ba8265f6 100644 --- a/requirements/cuda/serve.txt +++ b/requirements/cuda/serve.txt @@ -1,7 +1,7 @@ # serve-specific dependencies -r ./base.txt -vllm @ https://resource.flagos.net/repository/flagos-pypi-hosted/packages/vllm/0.11.0%2Bfl.0.1.cu124/vllm-0.11.0%2Bfl.0.1.cu124-cp312-cp312-linux_x86_64.whl +vllm @ https://resource.flagos.net/repository/flagos-pypi-hosted/packages/vllm/0.13.0%2Bfl.0.1.cu128.g72506c983/vllm-0.13.0%2Bfl.0.1.cu128.g72506c983-cp312-cp312-linux_x86_64.whl # support 0.5b_multiple_instance ci test ray==2.49.1 From 6a6a947508e34bcd0f9c1e6350e63912f308d2a9 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 10 Mar 2026 18:07:58 +0800 Subject: [PATCH 02/29] fix: code style adjustments in PR --- .github/workflows/all_tests_cuda.yml | 2 +- .github/workflows/build_image_cuda.yml | 11 +++++------ .github/workflows/push_image_harbor.yml | 13 ------------- 3 files changed, 6 insertions(+), 20 deletions(-) diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml index 6b05048c1a..e02f580c43 100644 --- a/.github/workflows/all_tests_cuda.yml +++ b/.github/workflows/all_tests_cuda.yml @@ -48,4 +48,4 @@ jobs: uses: ./.github/workflows/push_image_harbor.yml with: platform: cuda - secrets: inherit \ No newline at end of file + secrets: inherit diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 12d531cffa..43125a2143 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -88,7 +88,7 @@ jobs: # Manual trigger: build selected task only echo 'matrix={"task":["${{ inputs.task }}"]}' >> $GITHUB_OUTPUT else - # Push to main or build_all_tasks=true: build all tasks + # PR or build_all_tasks=true: build all tasks echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT fi @@ -97,12 +97,13 @@ jobs: run: | EVENT="${{ github.event_name }}" - if 
[ "$EVENT" = "push" ]; then - # Push to main: always build dev images and push + if [ "$EVENT" = "pull_request" ]; then + # PR: always build dev images, push to local registry, use cache echo "target=dev" >> $GITHUB_OUTPUT echo "push=true" >> $GITHUB_OUTPUT echo "no_cache=false" >> $GITHUB_OUTPUT else + # workflow_dispatch: use user-provided inputs echo "target=${{ inputs.target || 'dev' }}" >> $GITHUB_OUTPUT echo "push=${{ inputs.push }}" >> $GITHUB_OUTPUT echo "no_cache=${{ inputs.no_cache }}" >> $GITHUB_OUTPUT @@ -189,8 +190,6 @@ jobs: echo "| Local Tag | \`${LOCAL_TAG}\` |" } >> $GITHUB_STEP_SUMMARY - echo "PKG_MGR: ${{ env.PKG_MGR }}" - - name: Build Docker image uses: docker/build-push-action@v6 with: @@ -235,7 +234,7 @@ jobs: name: Update CI config needs: build runs-on: [self-hosted, Linux, X64] - if: needs.prepare.outputs.push == 'true' && needs.build.result == 'success' + if: needs.build.result == 'success' steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml index 2a89aff39d..d10e9a3e4c 100644 --- a/.github/workflows/push_image_harbor.yml +++ b/.github/workflows/push_image_harbor.yml @@ -83,19 +83,6 @@ jobs: echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT echo "Successfully pushed ${REMOTE_TAG}" - - name: Commit and push config update - if: steps.tags.outputs.needs_promotion == 'true' - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add .github/configs/${{ inputs.platform }}.yml - if git diff --cached --quiet; then - echo "No config changes to commit" - else - git commit -m "ci: promote ${{ inputs.platform }} image tags to Harbor [skip ci]" - git push - fi - # --------------------------------------------------------------------------- # Cleanup: clean up Docker build cache and dangling images on self-hosted runner # 
--------------------------------------------------------------------------- From 78432660794386966e20dd54da90099f6c862949 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 10 Mar 2026 18:28:04 +0800 Subject: [PATCH 03/29] Set safe directory --- .github/workflows/build_image_cuda.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 43125a2143..730a1c98f9 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -124,6 +124,10 @@ jobs: inference_tag: ${{ steps.export.outputs.inference_tag }} all_tag: ${{ steps.export.outputs.all_tag }} steps: + - name: Set safe directory + run: | + git config --global --add safe.directory . + - name: Checkout code uses: actions/checkout@v4 From 1f895eb600e188abeae11164ae1cf9f5e4cb406b Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 10 Mar 2026 18:36:52 +0800 Subject: [PATCH 04/29] Clean workspace --- .github/workflows/build_image_cuda.yml | 8 +++++--- .github/workflows/push_image_harbor.yml | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 730a1c98f9..80128017d5 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -124,9 +124,8 @@ jobs: inference_tag: ${{ steps.export.outputs.inference_tag }} all_tag: ${{ steps.export.outputs.all_tag }} steps: - - name: Set safe directory - run: | - git config --global --add safe.directory . 
+ - name: Clean workspace + run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true - name: Checkout code uses: actions/checkout@v4 @@ -240,6 +239,9 @@ jobs: runs-on: [self-hosted, Linux, X64] if: needs.build.result == 'success' steps: + - name: Clean workspace + run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true + - name: Checkout code uses: actions/checkout@v4 with: diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml index d10e9a3e4c..5f2053063a 100644 --- a/.github/workflows/push_image_harbor.yml +++ b/.github/workflows/push_image_harbor.yml @@ -17,6 +17,9 @@ jobs: name: Push validated images to Harbor runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] steps: + - name: Clean workspace + run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true + - name: Checkout code uses: actions/checkout@v4 From 903c520ce84ba055366980b6b88591bdb2a7ebf4 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Wed, 11 Mar 2026 10:43:07 +0800 Subject: [PATCH 05/29] Remove redundant clean workspace steps before checkout --- .github/workflows/build_image_cuda.yml | 6 ------ .github/workflows/push_image_harbor.yml | 3 --- 2 files changed, 9 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 80128017d5..43125a2143 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -124,9 +124,6 @@ jobs: inference_tag: ${{ steps.export.outputs.inference_tag }} all_tag: ${{ steps.export.outputs.all_tag }} steps: - - name: Clean workspace - run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true - - name: Checkout code uses: actions/checkout@v4 @@ -239,9 +236,6 @@ jobs: runs-on: [self-hosted, Linux, X64] if: needs.build.result == 'success' steps: - - name: Clean workspace - run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true - - name: Checkout code uses: 
actions/checkout@v4 with: diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml index 5f2053063a..d10e9a3e4c 100644 --- a/.github/workflows/push_image_harbor.yml +++ b/.github/workflows/push_image_harbor.yml @@ -17,9 +17,6 @@ jobs: name: Push validated images to Harbor runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] steps: - - name: Clean workspace - run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true - - name: Checkout code uses: actions/checkout@v4 From 84d54bc17cc84c83d2cb3fefe8a4f25cdd5a1cad Mon Sep 17 00:00:00 2001 From: zihugithub Date: Thu, 12 Mar 2026 14:26:42 +0800 Subject: [PATCH 06/29] fix(ci): use head_ref for PR checkout to avoid detached HEAD on push --- .github/workflows/build_image_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 43125a2143..d95236a48d 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -239,7 +239,7 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - ref: ${{ github.ref }} + ref: ${{ github.head_ref || github.ref }} token: ${{ secrets.GITHUB_TOKEN }} - name: Update cuda.yml with new image tags From 753242a0cd20ba277892984410ad30bde3d9a850 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Mon, 16 Mar 2026 18:13:52 +0800 Subject: [PATCH 07/29] fix git fetch failure in update-config job --- .github/workflows/build_image_cuda.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index d95236a48d..db23dba087 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -52,6 +52,9 @@ on: - 'requirements/**' - '.github/workflows/build_image_cuda.yml' +permissions: + contents: write + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || 
github.ref }} cancel-in-progress: true @@ -233,7 +236,7 @@ jobs: update-config: name: Update CI config needs: build - runs-on: [self-hosted, Linux, X64] + runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] if: needs.build.result == 'success' steps: - name: Checkout code @@ -241,6 +244,8 @@ jobs: with: ref: ${{ github.head_ref || github.ref }} token: ${{ secrets.GITHUB_TOKEN }} + clean: true + fetch-depth: 0 - name: Update cuda.yml with new image tags run: | From 26c3a1407349363f3a47fafa5f5ea01eabcb2929 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 17 Mar 2026 09:52:44 +0800 Subject: [PATCH 08/29] Clean workspace --- .github/workflows/build_image_cuda.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index db23dba087..232a721529 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -127,6 +127,9 @@ jobs: inference_tag: ${{ steps.export.outputs.inference_tag }} all_tag: ${{ steps.export.outputs.all_tag }} steps: + - name: Clean workspace + run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true + - name: Checkout code uses: actions/checkout@v4 @@ -239,6 +242,9 @@ jobs: runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] if: needs.build.result == 'success' steps: + - name: Clean workspace + run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true + - name: Checkout code uses: actions/checkout@v4 with: From bf71fbeb2717fddd802dfbe30b6c43e2cbb12300 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 17 Mar 2026 10:56:20 +0800 Subject: [PATCH 09/29] add paths-ignore for test workflow and fix PR checkout in build workflow --- .github/workflows/all_tests_cuda.yml | 6 ++++++ .github/workflows/build_image_cuda.yml | 8 ++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml index 
e02f580c43..2b3441adbd 100644 --- a/.github/workflows/all_tests_cuda.yml +++ b/.github/workflows/all_tests_cuda.yml @@ -10,6 +10,12 @@ on: branches: ["main"] pull_request: branches: ["main"] + paths-ignore: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }} diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 232a721529..e04fb2d8f4 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -248,11 +248,15 @@ jobs: - name: Checkout code uses: actions/checkout@v4 with: - ref: ${{ github.head_ref || github.ref }} token: ${{ secrets.GITHUB_TOKEN }} - clean: true fetch-depth: 0 + - name: Switch to PR branch + if: github.event_name == 'pull_request' + run: | + git fetch origin "${{ github.head_ref }}" + git checkout "${{ github.head_ref }}" + - name: Update cuda.yml with new image tags run: | set -euo pipefail From 9cfa2e1483fbcc9b0761717d528a1d57d45fc118 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 17 Mar 2026 12:06:01 +0800 Subject: [PATCH 10/29] support fork PRs in build_image_cuda workflow --- .github/workflows/build_image_cuda.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index e04fb2d8f4..968bc61113 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -254,8 +254,11 @@ jobs: - name: Switch to PR branch if: github.event_name == 'pull_request' run: | - git fetch origin "${{ github.head_ref }}" - git checkout "${{ github.head_ref }}" + PR_REPO="${{ github.event.pull_request.head.repo.clone_url }}" + PR_BRANCH="${{ github.head_ref }}" + git remote add pr-head "${PR_REPO}" || true + git fetch pr-head "${PR_BRANCH}" + 
git checkout -b "${PR_BRANCH}" "pr-head/${PR_BRANCH}" - name: Update cuda.yml with new image tags run: | @@ -287,7 +290,11 @@ jobs: echo "No config changes to commit" else git commit -m "ci: update CUDA image tags [skip ci]" - git push + if [ "${{ github.event_name }}" = "pull_request" ]; then + git push pr-head HEAD:"${{ github.head_ref }}" + else + git push + fi fi # --------------------------------------------------------------------------- From 731083117fe0902ed881f8e8abe1c882fbeae087 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 17 Mar 2026 14:02:34 +0800 Subject: [PATCH 11/29] use FORK_PUSH_TOKEN for fork PR push and add token-help job --- .github/workflows/build_image_cuda.yml | 39 ++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 968bc61113..22f365c7af 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -254,9 +254,10 @@ jobs: - name: Switch to PR branch if: github.event_name == 'pull_request' run: | - PR_REPO="${{ github.event.pull_request.head.repo.clone_url }}" + PR_REPO_FULL="${{ github.event.pull_request.head.repo.full_name }}" PR_BRANCH="${{ github.head_ref }}" - git remote add pr-head "${PR_REPO}" || true + FORK_URL="https://x-access-token:${{ secrets.FORK_PUSH_TOKEN }}@github.com/${PR_REPO_FULL}.git" + git remote add pr-head "${FORK_URL}" || true git fetch pr-head "${PR_BRANCH}" git checkout -b "${PR_BRANCH}" "pr-head/${PR_BRANCH}" @@ -297,12 +298,44 @@ jobs: fi fi + # --------------------------------------------------------------------------- + # Token help: print setup instructions when update-config fails on fork PRs + # --------------------------------------------------------------------------- + token-help: + name: Print token setup instructions + needs: update-config + runs-on: ubuntu-latest + if: failure() && github.event_name == 'pull_request' && 
github.event.pull_request.head.repo.full_name != github.repository + steps: + - name: Print FORK_PUSH_TOKEN setup instructions + run: | + echo "::error::Push to fork failed. The FORK_PUSH_TOKEN secret is missing or invalid." + { + echo "### Push to fork failed — FORK_PUSH_TOKEN setup required" + echo "" + echo "**Step 1: Create a Personal Access Token (PAT)**" + echo "1. Go to your GitHub account: \`Settings → Developer settings → Personal access tokens → Fine-grained tokens\`" + echo "2. Click \`Generate new token\`" + echo "3. Set \`Repository access\` → \`Only select repositories\` → select your fork repo (\`${{ github.event.pull_request.head.repo.full_name }}\`)" + echo "4. Set \`Permissions\` → \`Repository permissions\` → \`Contents\`: **Read and Write**" + echo "5. Click \`Generate token\` and copy the token value" + echo "" + echo "**Step 2: Add the secret to the upstream repository**" + echo "1. Go to [\`${{ github.repository }}\`](https://github.com/${{ github.repository }}/settings/secrets/actions) → \`Settings → Secrets and variables → Actions\`" + echo "2. Click \`New repository secret\`" + echo "3. Name: \`FORK_PUSH_TOKEN\`" + echo "4. Value: paste the token from Step 1" + echo "5. 
Click \`Add secret\`" + echo "" + echo "**Step 3:** Re-run this workflow" + } >> $GITHUB_STEP_SUMMARY + # --------------------------------------------------------------------------- # Summary: verify all builds completed # --------------------------------------------------------------------------- summary: name: Build summary - needs: update-config + needs: [update-config, token-help] runs-on: ubuntu-latest if: always() steps: From 8b39e356c3fff3cefe73a6bc8d828cd5f1f1ff21 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 17 Mar 2026 16:11:50 +0800 Subject: [PATCH 12/29] fix(ci): use stable image tags without timestamp for registry push --- .github/configs/cuda.yml | 4 +- .github/workflows/all_tests_cuda.yml | 27 ++--- .github/workflows/build_image_cuda.yml | 155 ++++++------------------- 3 files changed, 48 insertions(+), 138 deletions(-) diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml index 093b97dac2..cd1d99461f 100644 --- a/.github/configs/cuda.yml +++ b/.github/configs/cuda.yml @@ -7,8 +7,8 @@ display_name: "CUDA Tests" # Docker image for this hardware ci_image: localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.1-time2510131515 -ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721 -ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12-20260302102033 +ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12 +ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12 # Runner labels for this hardware runner_labels: diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml index 2b3441adbd..e02ef07b77 100644 --- a/.github/workflows/all_tests_cuda.yml +++ b/.github/workflows/all_tests_cuda.yml @@ -1,13 +1,17 @@ name: cuda_tests on: - # Trigger after Build Docker Images - CUDA succeeds - workflow_run: - workflows: ["Build Docker Images - CUDA"] - types: [completed] + # Called by Build Docker Images - CUDA workflow + workflow_call: push: 
branches: ["main"] + paths-ignore: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' pull_request: branches: ["main"] paths-ignore: @@ -23,10 +27,6 @@ concurrency: jobs: run_tests: - # Skip if triggered by workflow_run but the build did not succeed - if: >- - github.event_name != 'workflow_run' || - github.event.workflow_run.conclusion == 'success' # Package manager and environment settings are read from .github/configs/cuda.yml uses: ./.github/workflows/all_tests_common.yml with: @@ -44,14 +44,3 @@ jobs: exit 1 fi echo "✅ All tests passed!" - - push_images_to_harbor: - needs: all_tests - if: >- - needs.all_tests.result == 'success' && - github.event_name == 'workflow_run' && - github.event.workflow_run.conclusion == 'success' - uses: ./.github/workflows/push_image_harbor.yml - with: - platform: cuda - secrets: inherit diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 22f365c7af..44f3ef3efc 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -53,7 +53,7 @@ on: - '.github/workflows/build_image_cuda.yml' permissions: - contents: write + contents: read concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -155,33 +155,27 @@ jobs: TIMESTAMP=$(date +%Y%m%d%H%M%S) # Image naming follows docker/build.sh convention: - # flagscale-:-cu-py- + # flagscale-:-cu-py[-] IMAGE_NAME="flagscale-${TASK}" - TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}-${TIMESTAMP}" + TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}" + TAG_TS="${TAG}-${TIMESTAMP}" - # Local registry tag (for CI runners) + # Build tag (with timestamp, used during docker build) + BUILD_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG_TS}" + # Registry tag (without timestamp, used when pushing to registry) LOCAL_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG}" # Derived 
build arguments BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" PYTORCH_INDEX="https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}" - # Tags list: local registry only - TAGS="${LOCAL_TAG}" - echo "image_name=${IMAGE_NAME}" >> $GITHUB_OUTPUT echo "tag=${TAG}" >> $GITHUB_OUTPUT + echo "build_tag=${BUILD_TAG}" >> $GITHUB_OUTPUT echo "local_tag=${LOCAL_TAG}" >> $GITHUB_OUTPUT echo "base_image=${BASE_IMAGE}" >> $GITHUB_OUTPUT echo "pytorch_index=${PYTORCH_INDEX}" >> $GITHUB_OUTPUT - # Multi-line tags output - { - echo "tags<> $GITHUB_OUTPUT - # Job summary { echo "### Build: ${IMAGE_NAME}" @@ -193,7 +187,8 @@ jobs: echo "| CUDA | \`${CUDA_VERSION}\` |" echo "| Python | \`${PYTHON_VERSION}\` |" echo "| Dockerfile | \`docker/cuda/Dockerfile.${TASK}\` |" - echo "| Local Tag | \`${LOCAL_TAG}\` |" + echo "| Build Tag | \`${BUILD_TAG}\` |" + echo "| Registry Tag | \`${LOCAL_TAG}\` |" } >> $GITHUB_STEP_SUMMARY - name: Build Docker image @@ -203,7 +198,7 @@ jobs: file: docker/cuda/Dockerfile.${{ matrix.task }} target: ${{ needs.prepare.outputs.target }} load: true - tags: ${{ steps.meta.outputs.tags }} + tags: ${{ steps.meta.outputs.build_tag }} build-args: | BASE_IMAGE=${{ steps.meta.outputs.base_image }} CUDA_VERSION=${{ env.CUDA_VERSION }} @@ -216,7 +211,9 @@ jobs: - name: Push Docker image if: needs.prepare.outputs.push == 'true' - run: docker push ${{ steps.meta.outputs.local_tag }} + run: | + docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }} + docker push ${{ steps.meta.outputs.local_tag }} - name: Export image tag for config update id: export @@ -232,117 +229,41 @@ jobs: echo "**Result:** Built successfully" >> $GITHUB_STEP_SUMMARY echo "**Pushed:** ${{ needs.prepare.outputs.push }}" >> $GITHUB_STEP_SUMMARY - # --------------------------------------------------------------------------- - # Update Config: update cuda.yml with localhost tags (temporary, for test validation) - # After tests pass, 
push_image_harbor.yml will promote to Harbor and update to final tags - # --------------------------------------------------------------------------- - update-config: - name: Update CI config - needs: build - runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] - if: needs.build.result == 'success' - steps: - - name: Clean workspace - run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true - - - name: Checkout code - uses: actions/checkout@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - fetch-depth: 0 - - - name: Switch to PR branch - if: github.event_name == 'pull_request' - run: | - PR_REPO_FULL="${{ github.event.pull_request.head.repo.full_name }}" - PR_BRANCH="${{ github.head_ref }}" - FORK_URL="https://x-access-token:${{ secrets.FORK_PUSH_TOKEN }}@github.com/${PR_REPO_FULL}.git" - git remote add pr-head "${FORK_URL}" || true - git fetch pr-head "${PR_BRANCH}" - git checkout -b "${PR_BRANCH}" "pr-head/${PR_BRANCH}" - - - name: Update cuda.yml with new image tags - run: | - set -euo pipefail - CONFIG_FILE=".github/configs/cuda.yml" - - TRAIN_TAG="${{ needs.build.outputs.train_tag }}" - INFERENCE_TAG="${{ needs.build.outputs.inference_tag }}" - - if [ -n "$TRAIN_TAG" ]; then - echo "Updating ci_train_image to: $TRAIN_TAG" - sed -i "s|^ci_train_image:.*|ci_train_image: ${TRAIN_TAG}|" "$CONFIG_FILE" - fi - - if [ -n "$INFERENCE_TAG" ]; then - echo "Updating ci_inference_image to: $INFERENCE_TAG" - sed -i "s|^ci_inference_image:.*|ci_inference_image: ${INFERENCE_TAG}|" "$CONFIG_FILE" - fi - - echo "Updated config:" - cat "$CONFIG_FILE" - - - name: Commit and push config update - run: | - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add .github/configs/cuda.yml - if git diff --cached --quiet; then - echo "No config changes to commit" - else - git commit -m "ci: update CUDA image tags [skip ci]" - if [ "${{ github.event_name }}" = "pull_request" ]; then - git 
push pr-head HEAD:"${{ github.head_ref }}" - else - git push - fi - fi - - # --------------------------------------------------------------------------- - # Token help: print setup instructions when update-config fails on fork PRs - # --------------------------------------------------------------------------- - token-help: - name: Print token setup instructions - needs: update-config - runs-on: ubuntu-latest - if: failure() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository - steps: - - name: Print FORK_PUSH_TOKEN setup instructions - run: | - echo "::error::Push to fork failed. The FORK_PUSH_TOKEN secret is missing or invalid." - { - echo "### Push to fork failed — FORK_PUSH_TOKEN setup required" - echo "" - echo "**Step 1: Create a Personal Access Token (PAT)**" - echo "1. Go to your GitHub account: \`Settings → Developer settings → Personal access tokens → Fine-grained tokens\`" - echo "2. Click \`Generate new token\`" - echo "3. Set \`Repository access\` → \`Only select repositories\` → select your fork repo (\`${{ github.event.pull_request.head.repo.full_name }}\`)" - echo "4. Set \`Permissions\` → \`Repository permissions\` → \`Contents\`: **Read and Write**" - echo "5. Click \`Generate token\` and copy the token value" - echo "" - echo "**Step 2: Add the secret to the upstream repository**" - echo "1. Go to [\`${{ github.repository }}\`](https://github.com/${{ github.repository }}/settings/secrets/actions) → \`Settings → Secrets and variables → Actions\`" - echo "2. Click \`New repository secret\`" - echo "3. Name: \`FORK_PUSH_TOKEN\`" - echo "4. Value: paste the token from Step 1" - echo "5. 
Click \`Add secret\`" - echo "" - echo "**Step 3:** Re-run this workflow" - } >> $GITHUB_STEP_SUMMARY - # --------------------------------------------------------------------------- # Summary: verify all builds completed # --------------------------------------------------------------------------- summary: name: Build summary - needs: [update-config, token-help] + needs: build runs-on: ubuntu-latest if: always() steps: - name: Verify build results run: | - if [ "${{ needs.update-config.result }}" != "success" ]; then + if [ "${{ needs.build.result }}" != "success" ]; then echo "::error::One or more image builds failed" exit 1 fi echo "All Docker images built successfully!" + + # --------------------------------------------------------------------------- + # Run CUDA tests after build succeeds + # --------------------------------------------------------------------------- + run_cuda_tests: + name: Run CUDA tests + needs: summary + if: needs.summary.result == 'success' + uses: ./.github/workflows/all_tests_cuda.yml + secrets: inherit + + # --------------------------------------------------------------------------- + # Push validated images to Harbor after tests pass + # --------------------------------------------------------------------------- + push_images_to_harbor: + name: Push images to Harbor + needs: run_cuda_tests + if: needs.run_cuda_tests.result == 'success' + uses: ./.github/workflows/push_image_harbor.yml + with: + platform: cuda + secrets: inherit From a510a24a9be97d6464b69e70a37e5fdea119194d Mon Sep 17 00:00:00 2001 From: zihugithub Date: Fri, 20 Mar 2026 15:42:58 +0800 Subject: [PATCH 13/29] debug1 --- .github/workflows/build_image_cuda.yml | 68 +++++++++++++------------ .github/workflows/push_image_harbor.yml | 3 ++ 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 44f3ef3efc..8f4a71759e 100644 --- a/.github/workflows/build_image_cuda.yml +++ 
b/.github/workflows/build_image_cuda.yml @@ -191,29 +191,29 @@ jobs: echo "| Registry Tag | \`${LOCAL_TAG}\` |" } >> $GITHUB_STEP_SUMMARY - - name: Build Docker image - uses: docker/build-push-action@v6 - with: - context: . - file: docker/cuda/Dockerfile.${{ matrix.task }} - target: ${{ needs.prepare.outputs.target }} - load: true - tags: ${{ steps.meta.outputs.build_tag }} - build-args: | - BASE_IMAGE=${{ steps.meta.outputs.base_image }} - CUDA_VERSION=${{ env.CUDA_VERSION }} - UBUNTU_VERSION=${{ env.UBUNTU_VERSION }} - PYTHON_VERSION=${{ env.PYTHON_VERSION }} - UV_VERSION=${{ env.UV_VERSION }} - PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }} - PKG_MGR=${{ env.PKG_MGR }} - no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }} + # - name: Build Docker image + # uses: docker/build-push-action@v6 + # with: + # context: . + # file: docker/cuda/Dockerfile.${{ matrix.task }} + # target: ${{ needs.prepare.outputs.target }} + # load: true + # tags: ${{ steps.meta.outputs.build_tag }} + # build-args: | + # BASE_IMAGE=${{ steps.meta.outputs.base_image }} + # CUDA_VERSION=${{ env.CUDA_VERSION }} + # UBUNTU_VERSION=${{ env.UBUNTU_VERSION }} + # PYTHON_VERSION=${{ env.PYTHON_VERSION }} + # UV_VERSION=${{ env.UV_VERSION }} + # PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }} + # PKG_MGR=${{ env.PKG_MGR }} + # no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }} - - name: Push Docker image - if: needs.prepare.outputs.push == 'true' - run: | - docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }} - docker push ${{ steps.meta.outputs.local_tag }} + # - name: Push Docker image + # if: needs.prepare.outputs.push == 'true' + # run: | + # docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }} + # docker push ${{ steps.meta.outputs.local_tag }} - name: Export image tag for config update id: export @@ -246,23 +246,25 @@ jobs: fi echo "All Docker images built successfully!" 
- # --------------------------------------------------------------------------- - # Run CUDA tests after build succeeds - # --------------------------------------------------------------------------- - run_cuda_tests: - name: Run CUDA tests - needs: summary - if: needs.summary.result == 'success' - uses: ./.github/workflows/all_tests_cuda.yml - secrets: inherit + # # --------------------------------------------------------------------------- + # # Run CUDA tests after build succeeds + # # --------------------------------------------------------------------------- + # run_cuda_tests: + # name: Run CUDA tests + # needs: summary + # if: needs.summary.result == 'success' + # uses: ./.github/workflows/all_tests_cuda.yml + # secrets: inherit # --------------------------------------------------------------------------- # Push validated images to Harbor after tests pass # --------------------------------------------------------------------------- push_images_to_harbor: name: Push images to Harbor - needs: run_cuda_tests - if: needs.run_cuda_tests.result == 'success' + # needs: run_cuda_tests + needs: summary + # if: needs.run_cuda_tests.result == 'success' + if: needs.summary.result == 'success' uses: ./.github/workflows/push_image_harbor.yml with: platform: cuda diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml index d10e9a3e4c..640c59d0f6 100644 --- a/.github/workflows/push_image_harbor.yml +++ b/.github/workflows/push_image_harbor.yml @@ -17,6 +17,9 @@ jobs: name: Push validated images to Harbor runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] steps: + - name: Clean workspace + run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true + - name: Checkout code uses: actions/checkout@v4 From 3492db7fcb557acda0fcaff5438d9275c0acfec5 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Sat, 21 Mar 2026 10:41:12 +0800 Subject: [PATCH 14/29] re-enable Docker build/push and CUDA tests in build_image_cuda 
workflow --- .github/workflows/all_tests_common.yml | 67 +++++++++++++------------ .github/workflows/build_image_cuda.yml | 68 +++++++++++++------------- 2 files changed, 69 insertions(+), 66 deletions(-) diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml index d898543572..eb766f4111 100644 --- a/.github/workflows/all_tests_common.yml +++ b/.github/workflows/all_tests_common.yml @@ -112,32 +112,33 @@ jobs: container_options: ${{ needs.checkout_and_config.outputs.container_options }} source_artifact: flagscale-source-${{ github.sha }} - unit_tests: - needs: - - checkout_and_config - - cli_validation - strategy: - fail-fast: false - matrix: - device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }} - uses: ./.github/workflows/unit_tests_common.yml - name: unit_tests - with: - platform: ${{ inputs.platform }} - device: ${{ matrix.device }} - image: ${{ needs.checkout_and_config.outputs.ci_train_image }} - runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} - container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} - container_options: ${{ needs.checkout_and_config.outputs.container_options }} - source_artifact: flagscale-source-${{ github.sha }} - pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} - env_name: ${{ needs.checkout_and_config.outputs.env_name_train }} - env_path: ${{ needs.checkout_and_config.outputs.env_path }} + # unit_tests: + # needs: + # - checkout_and_config + # - cli_validation + # strategy: + # fail-fast: false + # matrix: + # device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }} + # uses: ./.github/workflows/unit_tests_common.yml + # name: unit_tests + # with: + # platform: ${{ inputs.platform }} + # device: ${{ matrix.device }} + # image: ${{ needs.checkout_and_config.outputs.ci_train_image }} + # runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} + # container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} + # 
container_options: ${{ needs.checkout_and_config.outputs.container_options }} + # source_artifact: flagscale-source-${{ github.sha }} + # pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} + # env_name: ${{ needs.checkout_and_config.outputs.env_name_train }} + # env_path: ${{ needs.checkout_and_config.outputs.env_path }} functional_tests_train: needs: - checkout_and_config - - unit_tests + - cli_validation + # - unit_tests if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_train.yml with: @@ -155,7 +156,8 @@ jobs: functional_tests_hetero_train: needs: - checkout_and_config - - unit_tests + - cli_validation + # - unit_tests if: fromJson(needs.checkout_and_config.outputs.hetero_train_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_hetero_train.yml with: @@ -173,7 +175,8 @@ jobs: functional_tests_inference: needs: - checkout_and_config - - unit_tests + - cli_validation + # - unit_tests if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_inference.yml with: @@ -191,7 +194,8 @@ jobs: functional_tests_serve: needs: - checkout_and_config - - unit_tests + - cli_validation + # - unit_tests if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_serve.yml with: @@ -228,7 +232,8 @@ jobs: functional_tests_benchmark: needs: - checkout_and_config - - unit_tests + - cli_validation + # - unit_tests if: fromJson(needs.checkout_and_config.outputs.benchmark_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_benchmark.yml with: @@ -250,7 +255,7 @@ jobs: needs: - checkout_and_config - cli_validation - - unit_tests + # - unit_tests - functional_tests_train - functional_tests_hetero_train - functional_tests_benchmark @@ -266,10 +271,10 @@ jobs: # Check all test jobs (skip if not run) failed=false - if [ "${{ needs.unit_tests.result }}" != 
"success" ]; then - echo "❌ Unit tests failed" - failed=true - fi + # if [ "${{ needs.unit_tests.result }}" != "success" ]; then + # echo "❌ Unit tests failed" + # failed=true + # fi if [ "${{ needs.cli_validation.result }}" != "success" ]; then echo "❌ CLI validation failed" diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 8f4a71759e..44f3ef3efc 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -191,29 +191,29 @@ jobs: echo "| Registry Tag | \`${LOCAL_TAG}\` |" } >> $GITHUB_STEP_SUMMARY - # - name: Build Docker image - # uses: docker/build-push-action@v6 - # with: - # context: . - # file: docker/cuda/Dockerfile.${{ matrix.task }} - # target: ${{ needs.prepare.outputs.target }} - # load: true - # tags: ${{ steps.meta.outputs.build_tag }} - # build-args: | - # BASE_IMAGE=${{ steps.meta.outputs.base_image }} - # CUDA_VERSION=${{ env.CUDA_VERSION }} - # UBUNTU_VERSION=${{ env.UBUNTU_VERSION }} - # PYTHON_VERSION=${{ env.PYTHON_VERSION }} - # UV_VERSION=${{ env.UV_VERSION }} - # PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }} - # PKG_MGR=${{ env.PKG_MGR }} - # no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }} + - name: Build Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: docker/cuda/Dockerfile.${{ matrix.task }} + target: ${{ needs.prepare.outputs.target }} + load: true + tags: ${{ steps.meta.outputs.build_tag }} + build-args: | + BASE_IMAGE=${{ steps.meta.outputs.base_image }} + CUDA_VERSION=${{ env.CUDA_VERSION }} + UBUNTU_VERSION=${{ env.UBUNTU_VERSION }} + PYTHON_VERSION=${{ env.PYTHON_VERSION }} + UV_VERSION=${{ env.UV_VERSION }} + PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }} + PKG_MGR=${{ env.PKG_MGR }} + no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }} - # - name: Push Docker image - # if: needs.prepare.outputs.push == 'true' - # run: | - # docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }} - # docker push ${{ steps.meta.outputs.local_tag }} + - name: Push Docker image + if: needs.prepare.outputs.push == 'true' + run: | + docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }} + docker push ${{ steps.meta.outputs.local_tag }} - name: Export image tag for config update id: export @@ -246,25 +246,23 @@ jobs: fi echo "All Docker images built successfully!" 
- # # --------------------------------------------------------------------------- - # # Run CUDA tests after build succeeds - # # --------------------------------------------------------------------------- - # run_cuda_tests: - # name: Run CUDA tests - # needs: summary - # if: needs.summary.result == 'success' - # uses: ./.github/workflows/all_tests_cuda.yml - # secrets: inherit + # --------------------------------------------------------------------------- + # Run CUDA tests after build succeeds + # --------------------------------------------------------------------------- + run_cuda_tests: + name: Run CUDA tests + needs: summary + if: needs.summary.result == 'success' + uses: ./.github/workflows/all_tests_cuda.yml + secrets: inherit # --------------------------------------------------------------------------- # Push validated images to Harbor after tests pass # --------------------------------------------------------------------------- push_images_to_harbor: name: Push images to Harbor - # needs: run_cuda_tests - needs: summary - # if: needs.run_cuda_tests.result == 'success' - if: needs.summary.result == 'success' + needs: run_cuda_tests + if: needs.run_cuda_tests.result == 'success' uses: ./.github/workflows/push_image_harbor.yml with: platform: cuda From 03ee149b6ed918c5ce0a3a2d10d900027991b405 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Mon, 30 Mar 2026 17:14:05 +0800 Subject: [PATCH 15/29] add two-stage pipeline to support fork PR builds --- .github/workflows/build_image_cuda.yml | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 44f3ef3efc..b10dc6da8e 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -42,7 +42,7 @@ on: type: boolean default: false - # Trigger on PRs that modify docker-related files + # Trigger on PRs that modify docker-related files (build + test only, no push) 
pull_request: branches: [main] paths: @@ -52,6 +52,16 @@ on: - 'requirements/**' - '.github/workflows/build_image_cuda.yml' + # Trigger on merge to main: full pipeline including push to Harbor + push: + branches: [main] + paths: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' + permissions: contents: read @@ -252,7 +262,7 @@ jobs: run_cuda_tests: name: Run CUDA tests needs: summary - if: needs.summary.result == 'success' + if: needs.summary.result == 'success' && github.event_name != 'pull_request' uses: ./.github/workflows/all_tests_cuda.yml secrets: inherit @@ -262,7 +272,7 @@ jobs: push_images_to_harbor: name: Push images to Harbor needs: run_cuda_tests - if: needs.run_cuda_tests.result == 'success' + if: needs.run_cuda_tests.result == 'success' && github.event_name != 'pull_request' uses: ./.github/workflows/push_image_harbor.yml with: platform: cuda From bc952890b853fb914b4cbd109cd9c22acacb987d Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 31 Mar 2026 11:34:42 +0800 Subject: [PATCH 16/29] debug2 --- .github/workflows/build_image_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index b10dc6da8e..63d1297618 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -262,7 +262,7 @@ jobs: run_cuda_tests: name: Run CUDA tests needs: summary - if: needs.summary.result == 'success' && github.event_name != 'pull_request' + if: needs.summary.result == 'success' uses: ./.github/workflows/all_tests_cuda.yml secrets: inherit From 02ffa3134f0a2af4d1e967c15b750729af18dc94 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Wed, 8 Apr 2026 19:07:00 +0800 Subject: [PATCH 17/29] ci: refactor build_image_cuda workflow --- .github/configs/cuda.yml | 4 +- .github/workflows/all_tests_common.yml | 18 +++ .github/workflows/all_tests_cuda.yml | 5 +- 
.github/workflows/build_image_cuda.yml | 99 +++++++++------- .github/workflows/push_image_harbor.yml | 150 ++++++++++++++++++------ 5 files changed, 192 insertions(+), 84 deletions(-) diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml index cd1d99461f..093b97dac2 100644 --- a/.github/configs/cuda.yml +++ b/.github/configs/cuda.yml @@ -7,8 +7,8 @@ display_name: "CUDA Tests" # Docker image for this hardware ci_image: localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.1-time2510131515 -ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12 -ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12 +ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721 +ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12-20260302102033 # Runner labels for this hardware runner_labels: diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml index eb766f4111..47af712e49 100644 --- a/.github/workflows/all_tests_common.yml +++ b/.github/workflows/all_tests_common.yml @@ -7,6 +7,16 @@ on: required: true type: string description: Platform name (e.g., cuda, default) + ci_train_image: + required: false + type: string + description: Override train image (e.g., newly built image). Falls back to platform config if not set. + default: "" + ci_inference_image: + required: false + type: string + description: Override inference image. Falls back to platform config if not set. 
+ default: "" jobs: checkout_and_config: @@ -100,6 +110,14 @@ jobs: # Load configuration and group tests by task load_platform_config "$PLATFORM" + # Override images if provided as inputs + if [ -n "${{ inputs.ci_train_image }}" ]; then + echo "ci_train_image=${{ inputs.ci_train_image }}" >> $GITHUB_OUTPUT + fi + if [ -n "${{ inputs.ci_inference_image }}" ]; then + echo "ci_inference_image=${{ inputs.ci_inference_image }}" >> $GITHUB_OUTPUT + fi + # CLI validation runs first (outside virtual env) as a gate for all subsequent tests cli_validation: needs: checkout_and_config diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml index e02ef07b77..2374e10850 100644 --- a/.github/workflows/all_tests_cuda.yml +++ b/.github/workflows/all_tests_cuda.yml @@ -1,9 +1,6 @@ name: cuda_tests on: - # Called by Build Docker Images - CUDA workflow - workflow_call: - push: branches: ["main"] paths-ignore: @@ -12,6 +9,7 @@ on: - 'tools/install/**' - 'requirements/**' - '.github/workflows/build_image_cuda.yml' + - '.github/workflows/push_image_harbor.yml' pull_request: branches: ["main"] paths-ignore: @@ -20,6 +18,7 @@ on: - 'tools/install/**' - 'requirements/**' - '.github/workflows/build_image_cuda.yml' + - '.github/workflows/push_image_harbor.yml' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }} diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 63d1297618..ae96330ada 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -41,6 +41,10 @@ on: description: 'Build all tasks (train, inference, all) - overrides task selection' type: boolean default: false + tar_dir: + description: 'Directory to store image tar files' + type: string + default: '/home/flagscale_cicd/images_tar' # Trigger on PRs that modify docker-related files (build + test only, no push) pull_request: @@ -52,16 +56,6 @@ on: - 
'requirements/**' - '.github/workflows/build_image_cuda.yml' - # Trigger on merge to main: full pipeline including push to Harbor - push: - branches: [main] - paths: - - 'docker/cuda/**' - - 'docker/build.sh' - - 'tools/install/**' - - 'requirements/**' - - '.github/workflows/build_image_cuda.yml' - permissions: contents: read @@ -78,6 +72,7 @@ env: PYTHON_VERSION: '3.12' UV_VERSION: '0.7.2' PKG_MGR: ${{ inputs.pkg_mgr || 'conda' }} + TAR_DIR: ${{ inputs.tar_dir || '/home/flagscale_cicd/images_tar' }} jobs: # --------------------------------------------------------------------------- @@ -102,7 +97,8 @@ jobs: echo 'matrix={"task":["${{ inputs.task }}"]}' >> $GITHUB_OUTPUT else # PR or build_all_tasks=true: build all tasks - echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT + # echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT + echo 'matrix={"task":["train"]}' >> $GITHUB_OUTPUT fi - name: Set build parameters @@ -136,6 +132,9 @@ jobs: train_tag: ${{ steps.export.outputs.train_tag }} inference_tag: ${{ steps.export.outputs.inference_tag }} all_tag: ${{ steps.export.outputs.all_tag }} + train_tar: ${{ steps.export.outputs.train_tar }} + inference_tar: ${{ steps.export.outputs.inference_tar }} + all_tar: ${{ steps.export.outputs.all_tar }} steps: - name: Clean workspace run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true @@ -167,22 +166,16 @@ jobs: # Image naming follows docker/build.sh convention: # flagscale-:-cu-py[-] IMAGE_NAME="flagscale-${TASK}" - TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}" - TAG_TS="${TAG}-${TIMESTAMP}" + TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}-${TIMESTAMP}" - # Build tag (with timestamp, used during docker build) - BUILD_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG_TS}" - # Registry tag (without timestamp, used when pushing to registry) - LOCAL_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG}" + BUILD_TAG="${{ env.REGISTRY 
}}/${IMAGE_NAME}:${TAG}" # Derived build arguments BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" PYTORCH_INDEX="https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}" echo "image_name=${IMAGE_NAME}" >> $GITHUB_OUTPUT - echo "tag=${TAG}" >> $GITHUB_OUTPUT echo "build_tag=${BUILD_TAG}" >> $GITHUB_OUTPUT - echo "local_tag=${LOCAL_TAG}" >> $GITHUB_OUTPUT echo "base_image=${BASE_IMAGE}" >> $GITHUB_OUTPUT echo "pytorch_index=${PYTORCH_INDEX}" >> $GITHUB_OUTPUT @@ -198,7 +191,6 @@ jobs: echo "| Python | \`${PYTHON_VERSION}\` |" echo "| Dockerfile | \`docker/cuda/Dockerfile.${TASK}\` |" echo "| Build Tag | \`${BUILD_TAG}\` |" - echo "| Registry Tag | \`${LOCAL_TAG}\` |" } >> $GITHUB_STEP_SUMMARY - name: Build Docker image @@ -219,18 +211,23 @@ jobs: PKG_MGR=${{ env.PKG_MGR }} no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }} - - name: Push Docker image - if: needs.prepare.outputs.push == 'true' + - name: Save image as tar run: | - docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }} - docker push ${{ steps.meta.outputs.local_tag }} + TAR_DIR="${{ env.TAR_DIR }}" + mkdir -p "$TAR_DIR" + IMAGE_TAG="${{ steps.meta.outputs.build_tag }}" + # Use image name with tag as filename (replace / and : with -) + TAR_NAME=$(echo "$IMAGE_TAG" | tr '/: ' '---').tar + docker save "$IMAGE_TAG" -o "${TAR_DIR}/${TAR_NAME}" + echo "tar_path=${TAR_DIR}/${TAR_NAME}" >> $GITHUB_OUTPUT + id: save_tar - name: Export image tag for config update id: export - if: success() && needs.prepare.outputs.push == 'true' run: | TASK="${{ matrix.task }}" - echo "${TASK}_tag=${{ steps.meta.outputs.local_tag }}" >> $GITHUB_OUTPUT + echo "${TASK}_tag=${{ steps.meta.outputs.build_tag }}" >> $GITHUB_OUTPUT + echo "${TASK}_tar=${{ steps.save_tar.outputs.tar_path }}" >> $GITHUB_OUTPUT - name: Print build result if: success() @@ -257,23 +254,45 @@ jobs: echo "All Docker images built successfully!" 
# --------------------------------------------------------------------------- - # Run CUDA tests after build succeeds + # Load images from tar and push to local registry before running tests # --------------------------------------------------------------------------- - run_cuda_tests: - name: Run CUDA tests - needs: summary - if: needs.summary.result == 'success' - uses: ./.github/workflows/all_tests_cuda.yml - secrets: inherit + load_images: + name: Load and push images + needs: ['build', 'summary'] + runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] + steps: + - name: Load train image from tar and push + run: | + TAR="${{ needs.build.outputs.train_tar }}" + TAG="${{ needs.build.outputs.train_tag }}" + if [ -f "$TAR" ]; then + echo "Loading $TAR" + docker load -i "$TAR" + docker push "$TAG" + else + echo "::warning::Train image tar not found: $TAR, skipping load" + fi + + - name: Load inference image from tar and push + run: | + TAR="${{ needs.build.outputs.inference_tar }}" + TAG="${{ needs.build.outputs.inference_tag }}" + if [ -f "$TAR" ]; then + echo "Loading $TAR" + docker load -i "$TAR" + docker push "$TAG" + else + echo "::warning::Inference image tar not found: $TAR, skipping load" + fi # --------------------------------------------------------------------------- - # Push validated images to Harbor after tests pass + # Run CUDA tests after build succeeds # --------------------------------------------------------------------------- - push_images_to_harbor: - name: Push images to Harbor - needs: run_cuda_tests - if: needs.run_cuda_tests.result == 'success' && github.event_name != 'pull_request' - uses: ./.github/workflows/push_image_harbor.yml + run_cuda_tests: + name: Run CUDA tests + needs: ['prepare', 'build', 'load_images'] + uses: ./.github/workflows/all_tests_common.yml with: platform: cuda - secrets: inherit + ci_train_image: ${{ needs.build.outputs.train_tag }} + ci_inference_image: ${{ needs.build.outputs.inference_tag }} diff --git 
a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml index 640c59d0f6..708d4040c2 100644 --- a/.github/workflows/push_image_harbor.yml +++ b/.github/workflows/push_image_harbor.yml @@ -1,51 +1,70 @@ name: Push Images to Harbor on: - workflow_call: - inputs: - platform: - required: true - type: string - description: "Platform name (e.g. cuda), used to locate .github/configs/.yml" + push: + branches: ['main'] + paths: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' + +permissions: + contents: write env: REMOTE_REGISTRY: harbor.baai.ac.cn REMOTE_IMAGE_PREFIX: flagscale + TAR_DIR: /home/flagscale_cicd/images_tar jobs: - promote: - name: Push validated images to Harbor + # --------------------------------------------------------------------------- + # Prepare: scan tar directory and detect which images need promotion + # --------------------------------------------------------------------------- + prepare: + name: Detect tar files runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] + outputs: + needs_promotion: ${{ steps.detect.outputs.needs_promotion }} + train_tar: ${{ steps.detect.outputs.train_tar }} + inference_tar: ${{ steps.detect.outputs.inference_tar }} steps: - - name: Clean workspace - run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true - - - name: Checkout code - uses: actions/checkout@v4 - - - name: Read image tags from config - id: tags + - name: Detect tar files in ${{ env.TAR_DIR }} + id: detect run: | set -euo pipefail - CONFIG_FILE=".github/configs/${{ inputs.platform }}.yml" + TAR_DIR="${{ env.TAR_DIR }}" - TRAIN_TAG=$(grep '^ci_train_image:' "$CONFIG_FILE" | awk '{print $2}') - INFERENCE_TAG=$(grep '^ci_inference_image:' "$CONFIG_FILE" | awk '{print $2}') + TRAIN_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-train*' -name '*.tar' 2>/dev/null | sort | tail -1) + INFERENCE_TAR=$(find "$TAR_DIR" 
-maxdepth 1 -name '*flagscale-inference*' -name '*.tar' 2>/dev/null | sort | tail -1) - echo "train_tag=${TRAIN_TAG}" >> $GITHUB_OUTPUT - echo "inference_tag=${INFERENCE_TAG}" >> $GITHUB_OUTPUT + echo "train_tar=${TRAIN_TAR}" >> $GITHUB_OUTPUT + echo "inference_tar=${INFERENCE_TAR}" >> $GITHUB_OUTPUT - # Check if images are from localhost (freshly built, need promotion) - if echo "${TRAIN_TAG}${INFERENCE_TAG}" | grep -q 'localhost'; then + if [ -n "$TRAIN_TAR" ] || [ -n "$INFERENCE_TAR" ]; then echo "needs_promotion=true" >> $GITHUB_OUTPUT - echo "Images are from localhost, promotion needed" + echo "Detected tars:" + if [ -n "$TRAIN_TAR" ]; then echo " train: $TRAIN_TAR"; fi + if [ -n "$INFERENCE_TAR" ]; then echo " inference: $INFERENCE_TAR"; fi else echo "needs_promotion=false" >> $GITHUB_OUTPUT - echo "Images already on Harbor, skipping promotion" + echo "No tar files found in $TAR_DIR, skipping promotion" fi + # --------------------------------------------------------------------------- + # Promote: load tar → retag → push to Harbor → delete tar + # --------------------------------------------------------------------------- + promote: + name: Push validated images to Harbor + needs: prepare + if: needs.prepare.outputs.needs_promotion == 'true' + runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] + outputs: + remote_train_tag: ${{ steps.promote_train.outputs.remote_tag }} + remote_inference_tag: ${{ steps.promote_inference.outputs.remote_tag }} + steps: - name: Login to Harbor registry - if: steps.tags.outputs.needs_promotion == 'true' uses: docker/login-action@v3 with: registry: ${{ env.REMOTE_REGISTRY }} @@ -54,37 +73,91 @@ jobs: - name: Promote train image to Harbor id: promote_train - if: steps.tags.outputs.needs_promotion == 'true' && steps.tags.outputs.train_tag != '' + if: needs.prepare.outputs.train_tar != '' run: | set -euo pipefail - LOCAL_TAG="${{ steps.tags.outputs.train_tag }}" + TAR_PATH="${{ needs.prepare.outputs.train_tar }}" + + echo "Loading 
tar: $TAR_PATH" + LOCAL_TAG=$(docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}') + echo "Loaded image tag: $LOCAL_TAG" + + # Strip local registry prefix (e.g. localhost:5000/) to get image:tag IMAGE_AND_TAG="${LOCAL_TAG#*/}" REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}" - echo "Promoting: ${LOCAL_TAG} → ${REMOTE_TAG}" - docker pull "${LOCAL_TAG}" docker tag "${LOCAL_TAG}" "${REMOTE_TAG}" docker push "${REMOTE_TAG}" - echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT - echo "Successfully pushed ${REMOTE_TAG}" + echo "Pushed: ${REMOTE_TAG}" + + echo "Removing tar: $TAR_PATH" + rm -f "$TAR_PATH" - name: Promote inference image to Harbor id: promote_inference - if: steps.tags.outputs.needs_promotion == 'true' && steps.tags.outputs.inference_tag != '' + if: needs.prepare.outputs.inference_tar != '' run: | set -euo pipefail - LOCAL_TAG="${{ steps.tags.outputs.inference_tag }}" + TAR_PATH="${{ needs.prepare.outputs.inference_tar }}" + + echo "Loading tar: $TAR_PATH" + LOCAL_TAG=$(docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}') + echo "Loaded image tag: $LOCAL_TAG" + IMAGE_AND_TAG="${LOCAL_TAG#*/}" REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}" - echo "Promoting: ${LOCAL_TAG} → ${REMOTE_TAG}" - docker pull "${LOCAL_TAG}" docker tag "${LOCAL_TAG}" "${REMOTE_TAG}" docker push "${REMOTE_TAG}" - echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT - echo "Successfully pushed ${REMOTE_TAG}" + echo "Pushed: ${REMOTE_TAG}" + + echo "Removing tar: $TAR_PATH" + rm -f "$TAR_PATH" + + # --------------------------------------------------------------------------- + # Update config: write Harbor image tags back to cuda.yml and commit + # --------------------------------------------------------------------------- + update_config: + name: Update cuda.yml with Harbor image tags + needs: promote + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: 
actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Update image tags in cuda.yml + run: | + set -euo pipefail + CONFIG_FILE=".github/configs/cuda.yml" + + REMOTE_TRAIN="${{ needs.promote.outputs.remote_train_tag }}" + REMOTE_INFERENCE="${{ needs.promote.outputs.remote_inference_tag }}" + + if [ -n "$REMOTE_TRAIN" ]; then + sed -i "s|^ci_train_image:.*|ci_train_image: ${REMOTE_TRAIN}|" "$CONFIG_FILE" + echo "Updated ci_train_image: ${REMOTE_TRAIN}" + fi + + if [ -n "$REMOTE_INFERENCE" ]; then + sed -i "s|^ci_inference_image:.*|ci_inference_image: ${REMOTE_INFERENCE}|" "$CONFIG_FILE" + echo "Updated ci_inference_image: ${REMOTE_INFERENCE}" + fi + + - name: Commit and push updated config + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add .github/configs/cuda.yml + if git diff --cached --quiet; then + echo "No changes to commit" + else + git commit -m "ci: update cuda.yml with new Harbor image tags [skip ci]" + git push + fi # --------------------------------------------------------------------------- # Cleanup: clean up Docker build cache and dangling images on self-hosted runner @@ -103,7 +176,6 @@ jobs: - name: Remove old localhost registry images run: | - # Remove local images tagged with localhost:5000 that are older than 7 days docker images --format '{{.Repository}}:{{.Tag}} {{.CreatedSince}}' \ | grep 'localhost:5000' \ | grep -E '(weeks|months)' \ From 56c30b10ae91ecd81cfab41752fe4e87bb81a86a Mon Sep 17 00:00:00 2001 From: zihugithub Date: Thu, 9 Apr 2026 10:07:26 +0800 Subject: [PATCH 18/29] ci: trigger push_image_harbor on build workflow success --- .github/workflows/build_image_cuda.yml | 3 +-- .github/workflows/push_image_harbor.yml | 12 ++++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index ae96330ada..49e80d299e 100644 --- 
a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -97,8 +97,7 @@ jobs: echo 'matrix={"task":["${{ inputs.task }}"]}' >> $GITHUB_OUTPUT else # PR or build_all_tasks=true: build all tasks - # echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT - echo 'matrix={"task":["train"]}' >> $GITHUB_OUTPUT + echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT fi - name: Set build parameters diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml index 708d4040c2..1c2f5fb59c 100644 --- a/.github/workflows/push_image_harbor.yml +++ b/.github/workflows/push_image_harbor.yml @@ -1,14 +1,9 @@ name: Push Images to Harbor on: - push: - branches: ['main'] - paths: - - 'docker/cuda/**' - - 'docker/build.sh' - - 'tools/install/**' - - 'requirements/**' - - '.github/workflows/build_image_cuda.yml' + workflow_run: + workflows: ['Build Docker Images - CUDA'] + types: [completed] permissions: contents: write @@ -25,6 +20,7 @@ jobs: prepare: name: Detect tar files runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] + if: github.event.workflow_run.conclusion == 'success' outputs: needs_promotion: ${{ steps.detect.outputs.needs_promotion }} train_tar: ${{ steps.detect.outputs.train_tar }} From 63ae08340f86d2f1b595ae6f3e587e05ef8324a9 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Thu, 9 Apr 2026 11:32:59 +0800 Subject: [PATCH 19/29] debug3 --- .github/workflows/build_image_cuda.yml | 4 ++-- .github/workflows/push_image_harbor.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 49e80d299e..9c34c39b96 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -213,11 +213,11 @@ jobs: - name: Save image as tar run: | TAR_DIR="${{ env.TAR_DIR }}" - mkdir -p "$TAR_DIR" + sudo mkdir -p "$TAR_DIR" IMAGE_TAG="${{ steps.meta.outputs.build_tag }}" 
# Use image name with tag as filename (replace / and : with -) TAR_NAME=$(echo "$IMAGE_TAG" | tr '/: ' '---').tar - docker save "$IMAGE_TAG" -o "${TAR_DIR}/${TAR_NAME}" + sudo docker save "$IMAGE_TAG" -o "${TAR_DIR}/${TAR_NAME}" echo "tar_path=${TAR_DIR}/${TAR_NAME}" >> $GITHUB_OUTPUT id: save_tar diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml index 1c2f5fb59c..c06e4a3f5f 100644 --- a/.github/workflows/push_image_harbor.yml +++ b/.github/workflows/push_image_harbor.yml @@ -88,7 +88,7 @@ jobs: echo "Pushed: ${REMOTE_TAG}" echo "Removing tar: $TAR_PATH" - rm -f "$TAR_PATH" + sudo rm -f "$TAR_PATH" - name: Promote inference image to Harbor id: promote_inference From cd02656e9b5c7305b27fe0aac7f9ccda967b81c6 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Thu, 9 Apr 2026 18:42:03 +0800 Subject: [PATCH 20/29] ci: add paths-ignore for ascend/metax, fix sudo docker load, support all-image promotion --- .github/workflows/all_tests_ascend.yml | 14 +++++ .github/workflows/all_tests_metax.yml | 14 +++++ .github/workflows/build_image_cuda.yml | 4 +- .github/workflows/functional_tests_train.yml | 66 -------------------- .github/workflows/push_image_harbor.yml | 51 ++++++++++++--- 5 files changed, 74 insertions(+), 75 deletions(-) diff --git a/.github/workflows/all_tests_ascend.yml b/.github/workflows/all_tests_ascend.yml index ae7709de76..459c8616a9 100644 --- a/.github/workflows/all_tests_ascend.yml +++ b/.github/workflows/all_tests_ascend.yml @@ -3,8 +3,22 @@ name: ascend_tests on: push: branches: ["main"] + paths-ignore: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' + - '.github/workflows/push_image_harbor.yml' pull_request: branches: ["main"] + paths-ignore: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' + - '.github/workflows/push_image_harbor.yml' 
concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }} diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml index e2933cddc8..b350a6c4e0 100644 --- a/.github/workflows/all_tests_metax.yml +++ b/.github/workflows/all_tests_metax.yml @@ -3,8 +3,22 @@ name: metax_c500_tests on: push: branches: ["main"] + paths-ignore: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' + - '.github/workflows/push_image_harbor.yml' pull_request: branches: ["main"] + paths-ignore: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' + - '.github/workflows/push_image_harbor.yml' concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }} diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 9c34c39b96..ec1b92abd2 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -266,7 +266,7 @@ jobs: TAG="${{ needs.build.outputs.train_tag }}" if [ -f "$TAR" ]; then echo "Loading $TAR" - docker load -i "$TAR" + sudo docker load -i "$TAR" docker push "$TAG" else echo "::warning::Train image tar not found: $TAR, skipping load" @@ -278,7 +278,7 @@ jobs: TAG="${{ needs.build.outputs.inference_tag }}" if [ -f "$TAR" ]; then echo "Loading $TAR" - docker load -i "$TAR" + sudo docker load -i "$TAR" docker push "$TAG" else echo "::warning::Inference image tar not found: $TAR, skipping load" diff --git a/.github/workflows/functional_tests_train.yml b/.github/workflows/functional_tests_train.yml index b815bba9fe..d3dd892884 100644 --- a/.github/workflows/functional_tests_train.yml +++ b/.github/workflows/functional_tests_train.yml @@ -112,7 +112,6 @@ jobs: git config --global --add safe.directory $PROJECT_ROOT 
- name: Install dependencies for training - if: inputs.platform == 'cuda' run: | set -euo pipefail cd $PROJECT_ROOT @@ -177,71 +176,6 @@ jobs: echo "Environment ready for train tests" timeout-minutes: 30 - - name: Install dependencies for serve metax - if: inputs.platform == 'metax' - run: | - set -euo pipefail - - # Clone FlagOS's Megatron-LM fork required by the serve task on MetaX platform - git clone https://github.com/flagos-ai/Megatron-LM-FL.git /workspace/Megatron-LM-FL - - cd $PROJECT_ROOT - - PKG_MGR='${{ inputs.pkg_mgr }}' - ENV_NAME='${{ inputs.env_name }}' - ENV_PATH='${{ inputs.env_path }}' - - echo "Installing dependencies for training" - echo "Package Manager: $PKG_MGR" - echo "Environment Name: $ENV_NAME" - echo "Environment Path: $ENV_PATH" - - # Source environment utilities - source ./tools/install/utils/pyenv_utils.sh - - # Activate environment based on package manager - case "$PKG_MGR" in - conda) - if [ -n "$ENV_NAME" ]; then - activate_conda "$ENV_NAME" "$ENV_PATH" || echo "⚠️ Conda activation failed" - fi - ;; - uv) - if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then - activate_uv_env "$ENV_PATH" || echo "⚠️ UV activation failed" - fi - ;; - pip) - echo "ℹ️ Running tests with pip/system Python" - ;; - esac - - # Display Python environment info - echo "Python location: $(which python)" - echo "Python version: $(python --version)" - - # Install Megatron-LM-FL - pip install megatron_core==0.1.0+megatron0.15.0rc7 \ - --extra-index-url https://resource.flagos.net/repository/flagos-pypi-hosted/simple \ - || { echo "❌ Megatron-LM-FL install failed"; exit 1; } - echo "✅ Megatron-LM-FL installed successfully" - - # Install TransformerEngine-FL and dependencies - git clone --depth 1 https://github.com/flagos-ai/TransformerEngine-FL.git /workspace/TransformerEngine-FL \ - || { echo "❌ TransformerEngine-FL clone failed"; exit 1; } - TE_FL_SKIP_CUDA=1 pip install /workspace/TransformerEngine-FL --no-build-isolation \ - || { echo "❌ TransformerEngine-FL 
install failed"; exit 1; } - echo "✅ TransformerEngine-FL installed successfully" - - # Install FlagScale - pip install . --no-build-isolation --root-user-action=ignore || { echo "❌ FlagScale CLI install failed"; exit 1; } - - # Verify installation - command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; } - echo "✅ FlagScale CLI installed successfully: $(flagscale --version 2>/dev/null || echo 'version unknown')" - echo "✅ Environment ready for train tests" - timeout-minutes: 30 - - name: Run functional tests id: functional_test run: | diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml index c06e4a3f5f..7f92fb55ce 100644 --- a/.github/workflows/push_image_harbor.yml +++ b/.github/workflows/push_image_harbor.yml @@ -1,9 +1,14 @@ name: Push Images to Harbor on: - workflow_run: - workflows: ['Build Docker Images - CUDA'] - types: [completed] + push: + branches: [main] + paths: + - 'docker/cuda/**' + - 'docker/build.sh' + - 'tools/install/**' + - 'requirements/**' + - '.github/workflows/build_image_cuda.yml' permissions: contents: write @@ -20,11 +25,11 @@ jobs: prepare: name: Detect tar files runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] - if: github.event.workflow_run.conclusion == 'success' outputs: needs_promotion: ${{ steps.detect.outputs.needs_promotion }} train_tar: ${{ steps.detect.outputs.train_tar }} inference_tar: ${{ steps.detect.outputs.inference_tar }} + all_tar: ${{ steps.detect.outputs.all_tar }} steps: - name: Detect tar files in ${{ env.TAR_DIR }} id: detect @@ -34,15 +39,18 @@ jobs: TRAIN_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-train*' -name '*.tar' 2>/dev/null | sort | tail -1) INFERENCE_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-inference*' -name '*.tar' 2>/dev/null | sort | tail -1) + ALL_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-all*' -name '*.tar' 2>/dev/null | sort | tail -1) echo "train_tar=${TRAIN_TAR}" >> $GITHUB_OUTPUT echo 
"inference_tar=${INFERENCE_TAR}" >> $GITHUB_OUTPUT + echo "all_tar=${ALL_TAR}" >> $GITHUB_OUTPUT - if [ -n "$TRAIN_TAR" ] || [ -n "$INFERENCE_TAR" ]; then + if [ -n "$TRAIN_TAR" ] || [ -n "$INFERENCE_TAR" ] || [ -n "$ALL_TAR" ]; then echo "needs_promotion=true" >> $GITHUB_OUTPUT echo "Detected tars:" if [ -n "$TRAIN_TAR" ]; then echo " train: $TRAIN_TAR"; fi if [ -n "$INFERENCE_TAR" ]; then echo " inference: $INFERENCE_TAR"; fi + if [ -n "$ALL_TAR" ]; then echo " all: $ALL_TAR"; fi else echo "needs_promotion=false" >> $GITHUB_OUTPUT echo "No tar files found in $TAR_DIR, skipping promotion" @@ -59,6 +67,7 @@ jobs: outputs: remote_train_tag: ${{ steps.promote_train.outputs.remote_tag }} remote_inference_tag: ${{ steps.promote_inference.outputs.remote_tag }} + remote_all_tag: ${{ steps.promote_all.outputs.remote_tag }} steps: - name: Login to Harbor registry uses: docker/login-action@v3 @@ -75,7 +84,7 @@ jobs: TAR_PATH="${{ needs.prepare.outputs.train_tar }}" echo "Loading tar: $TAR_PATH" - LOCAL_TAG=$(docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}') + LOCAL_TAG=$(sudo docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}') echo "Loaded image tag: $LOCAL_TAG" # Strip local registry prefix (e.g. 
localhost:5000/) to get image:tag @@ -98,7 +107,29 @@ jobs: TAR_PATH="${{ needs.prepare.outputs.inference_tar }}" echo "Loading tar: $TAR_PATH" - LOCAL_TAG=$(docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}') + LOCAL_TAG=$(sudo docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}') + echo "Loaded image tag: $LOCAL_TAG" + + IMAGE_AND_TAG="${LOCAL_TAG#*/}" + REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}" + + docker tag "${LOCAL_TAG}" "${REMOTE_TAG}" + docker push "${REMOTE_TAG}" + echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT + echo "Pushed: ${REMOTE_TAG}" + + echo "Removing tar: $TAR_PATH" + rm -f "$TAR_PATH" + + - name: Promote all image to Harbor + id: promote_all + if: needs.prepare.outputs.all_tar != '' + run: | + set -euo pipefail + TAR_PATH="${{ needs.prepare.outputs.all_tar }}" + + echo "Loading tar: $TAR_PATH" + LOCAL_TAG=$(sudo docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}') echo "Loaded image tag: $LOCAL_TAG" IMAGE_AND_TAG="${LOCAL_TAG#*/}" @@ -132,6 +163,7 @@ jobs: REMOTE_TRAIN="${{ needs.promote.outputs.remote_train_tag }}" REMOTE_INFERENCE="${{ needs.promote.outputs.remote_inference_tag }}" + REMOTE_ALL="${{ needs.promote.outputs.remote_all_tag }}" if [ -n "$REMOTE_TRAIN" ]; then sed -i "s|^ci_train_image:.*|ci_train_image: ${REMOTE_TRAIN}|" "$CONFIG_FILE" @@ -143,6 +175,11 @@ jobs: echo "Updated ci_inference_image: ${REMOTE_INFERENCE}" fi + if [ -n "$REMOTE_ALL" ]; then + sed -i "s|^ci_image:.*|ci_image: ${REMOTE_ALL}|" "$CONFIG_FILE" + echo "Updated ci_image: ${REMOTE_ALL}" + fi + - name: Commit and push updated config run: | git config user.name "github-actions[bot]" From 3b99fa37e3cdff5663b7dd985bf28008ce0f9a42 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Mon, 13 Apr 2026 18:34:05 +0800 Subject: [PATCH 21/29] ci: pass proxy settings to docker build stages --- .github/workflows/build_image_cuda.yml | 10 ++++++++ docker/cuda/Dockerfile.all | 32 
++++++++++++++++++++++++++ docker/cuda/Dockerfile.inference | 32 ++++++++++++++++++++++++++ docker/cuda/Dockerfile.train | 32 ++++++++++++++++++++++++++ 4 files changed, 106 insertions(+) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index ec1b92abd2..252e54a068 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -124,6 +124,10 @@ jobs: name: Build ${{ matrix.task }} needs: prepare runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8] + env: + HTTP_PROXY: ${{ vars.HTTP_PROXY }} + HTTPS_PROXY: ${{ vars.HTTPS_PROXY }} + NO_PROXY: ${{ vars.NO_PROXY }} strategy: fail-fast: false matrix: ${{ fromJson(needs.prepare.outputs.matrix) }} @@ -208,6 +212,12 @@ jobs: UV_VERSION=${{ env.UV_VERSION }} PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }} PKG_MGR=${{ env.PKG_MGR }} + HTTP_PROXY=${{ env.HTTP_PROXY }} + HTTPS_PROXY=${{ env.HTTPS_PROXY }} + NO_PROXY=${{ env.NO_PROXY }} + http_proxy=${{ env.HTTP_PROXY }} + https_proxy=${{ env.HTTPS_PROXY }} + no_proxy=${{ env.NO_PROXY }} no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }} - name: Save image as tar diff --git a/docker/cuda/Dockerfile.all b/docker/cuda/Dockerfile.all index f38b3b3405..e40201b073 100644 --- a/docker/cuda/Dockerfile.all +++ b/docker/cuda/Dockerfile.all @@ -38,6 +38,14 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch wheel index (derived from CUDA version) ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128 +# Proxy settings (build-time only, not persisted in image) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # ============================================================================= # BASE STAGE - System dependencies # ============================================================================= @@ -48,6 +56,14 @@ ARG PYTHON_VERSION ARG UV_VERSION ARG PKG_MGR +# Proxy settings (build-time only) 
+ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # Root installation directory (single source of truth) ARG FLAGSCALE_HOME=/root @@ -128,6 +144,14 @@ ARG PYTORCH_INDEX ARG PKG_MGR ARG FLAGSCALE_HOME=/root +# Proxy settings (build-time only) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # PyPI index URLs (re-declare to use in this stage) ARG PIP_INDEX_URL ARG PIP_EXTRA_INDEX_URL @@ -186,6 +210,14 @@ ARG PYTORCH_INDEX ARG PKG_MGR ARG FLAGSCALE_HOME=/root +# Proxy settings (build-time only) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # PyPI index URLs ARG PIP_INDEX_URL ARG PIP_EXTRA_INDEX_URL diff --git a/docker/cuda/Dockerfile.inference b/docker/cuda/Dockerfile.inference index 25788401b5..a6d3636292 100644 --- a/docker/cuda/Dockerfile.inference +++ b/docker/cuda/Dockerfile.inference @@ -36,6 +36,14 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch wheel index (derived from CUDA version) ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128 + +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # ============================================================================= # BASE STAGE - System dependencies # ============================================================================= @@ -46,6 +54,14 @@ ARG PYTHON_VERSION ARG UV_VERSION ARG PKG_MGR +# Proxy settings (build-time only) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # Root installation directory (single source of truth) ARG FLAGSCALE_HOME=/root @@ -126,6 +142,14 @@ ARG PYTORCH_INDEX ARG PKG_MGR ARG FLAGSCALE_HOME=/root +# Proxy settings (build-time only) 
+ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # PyPI index URLs (re-declare to use in this stage) ARG PIP_INDEX_URL ARG PIP_EXTRA_INDEX_URL @@ -184,6 +208,14 @@ ARG PYTORCH_INDEX ARG PKG_MGR ARG FLAGSCALE_HOME=/root +# Proxy settings (build-time only) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # PyPI index URLs ARG PIP_INDEX_URL ARG PIP_EXTRA_INDEX_URL diff --git a/docker/cuda/Dockerfile.train b/docker/cuda/Dockerfile.train index b6de815865..06fa58bc0e 100644 --- a/docker/cuda/Dockerfile.train +++ b/docker/cuda/Dockerfile.train @@ -36,6 +36,14 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} # PyTorch wheel index (derived from CUDA version) ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128 +# Proxy settings (build-time only, not persisted in image) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # ============================================================================= # BASE STAGE - System dependencies # ============================================================================= @@ -46,6 +54,14 @@ ARG PYTHON_VERSION ARG UV_VERSION ARG PKG_MGR +# Proxy settings (build-time only) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # Root installation directory (single source of truth) ARG FLAGSCALE_HOME=/root @@ -127,6 +143,14 @@ ARG PKG_MGR ARG FLAGSCALE_HOME=/root ARG PYTHON_VERSION=3.12 +# Proxy settings (build-time only) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # PyPI index URLs (re-declare to use in this stage) ARG PIP_INDEX_URL ARG PIP_EXTRA_INDEX_URL @@ -186,6 +210,14 @@ ARG PKG_MGR ARG 
FLAGSCALE_HOME=/root ARG PYTHON_VERSION=3.12 +# Proxy settings (build-time only) +ARG HTTP_PROXY +ARG HTTPS_PROXY +ARG NO_PROXY +ARG http_proxy=${HTTP_PROXY} +ARG https_proxy=${HTTPS_PROXY} +ARG no_proxy=${NO_PROXY} + # PyPI index URLs ARG PIP_INDEX_URL ARG PIP_EXTRA_INDEX_URL From c4c3e10e09432a1b88635e50b496433e6f2d9bef Mon Sep 17 00:00:00 2001 From: zihugithub Date: Tue, 14 Apr 2026 14:24:12 +0800 Subject: [PATCH 22/29] ci: auto-detect proxy from runner environment for docker build --- .github/workflows/build_image_cuda.yml | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 252e54a068..bbce763f0c 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -128,6 +128,7 @@ jobs: HTTP_PROXY: ${{ vars.HTTP_PROXY }} HTTPS_PROXY: ${{ vars.HTTPS_PROXY }} NO_PROXY: ${{ vars.NO_PROXY }} + # Note: vars.* may be empty for fork PRs; proxy is detected from runner env in the 'proxy' step strategy: fail-fast: false matrix: ${{ fromJson(needs.prepare.outputs.matrix) }} @@ -196,6 +197,18 @@ jobs: echo "| Build Tag | \`${BUILD_TAG}\` |" } >> $GITHUB_STEP_SUMMARY + - name: Detect proxy from runner environment + id: proxy + run: | + # Read proxy from runner environment (works for both fork PRs and direct pushes) + HTTP_PROXY_VAL="${http_proxy:-${HTTP_PROXY:-}}" + HTTPS_PROXY_VAL="${https_proxy:-${HTTPS_PROXY:-}}" + NO_PROXY_VAL="${no_proxy:-${NO_PROXY:-}}" + echo "http_proxy=${HTTP_PROXY_VAL}" >> $GITHUB_OUTPUT + echo "https_proxy=${HTTPS_PROXY_VAL}" >> $GITHUB_OUTPUT + echo "no_proxy=${NO_PROXY_VAL}" >> $GITHUB_OUTPUT + echo "Detected proxies: HTTP=${HTTP_PROXY_VAL} HTTPS=${HTTPS_PROXY_VAL} NO_PROXY=${NO_PROXY_VAL}" + - name: Build Docker image uses: docker/build-push-action@v6 with: @@ -212,12 +225,12 @@ jobs: UV_VERSION=${{ env.UV_VERSION }} PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }} PKG_MGR=${{ 
env.PKG_MGR }} - HTTP_PROXY=${{ env.HTTP_PROXY }} - HTTPS_PROXY=${{ env.HTTPS_PROXY }} - NO_PROXY=${{ env.NO_PROXY }} - http_proxy=${{ env.HTTP_PROXY }} - https_proxy=${{ env.HTTPS_PROXY }} - no_proxy=${{ env.NO_PROXY }} + HTTP_PROXY=${{ steps.proxy.outputs.http_proxy }} + HTTPS_PROXY=${{ steps.proxy.outputs.https_proxy }} + NO_PROXY=${{ steps.proxy.outputs.no_proxy }} + http_proxy=${{ steps.proxy.outputs.http_proxy }} + https_proxy=${{ steps.proxy.outputs.https_proxy }} + no_proxy=${{ steps.proxy.outputs.no_proxy }} no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }} - name: Save image as tar From 1a7a3bc451af35b879c933e84eafbd648763ba71 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Wed, 15 Apr 2026 10:52:48 +0800 Subject: [PATCH 23/29] ci: add runs_on parameter for custom runner selection --- .github/workflows/all_tests_common.yml | 11 +++++++++++ .github/workflows/build_image_cuda.yml | 1 + 2 files changed, 12 insertions(+) diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml index b15f455faf..8f4a8d8562 100644 --- a/.github/workflows/all_tests_common.yml +++ b/.github/workflows/all_tests_common.yml @@ -17,6 +17,11 @@ on: type: string description: Override inference image. Falls back to platform config if not set. default: "" + runs_on: + required: false + type: string + description: Override runs_on. Falls back to platform config if not set. 
+ default: "" jobs: checkout_and_config: @@ -117,6 +122,12 @@ jobs: if [ -n "${{ inputs.ci_inference_image }}" ]; then echo "ci_inference_image=${{ inputs.ci_inference_image }}" >> $GITHUB_OUTPUT fi + # Use single-quoted assignment so bash treats the JSON value literally + # (double quotes inside the JSON would break echo "runs_on=${{ inputs.runs_on }}") + RUNS_ON_INPUT='${{ inputs.runs_on }}' + if [ -n "$RUNS_ON_INPUT" ]; then + { echo 'runs_on<> $GITHUB_OUTPUT + fi # CLI validation runs first (outside virtual env) as a gate for all subsequent tests cli_validation: diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index bbce763f0c..d1c4f80322 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -318,3 +318,4 @@ jobs: platform: cuda ci_train_image: ${{ needs.build.outputs.train_tag }} ci_inference_image: ${{ needs.build.outputs.inference_tag }} + runs_on: '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' From fcbdbcce9308c0b809a4084913813fc6b68937cb Mon Sep 17 00:00:00 2001 From: zihugithub Date: Wed, 15 Apr 2026 13:29:09 +0800 Subject: [PATCH 24/29] ci: add runs_on and container_volumes as overridable workflow inputs --- .github/workflows/all_tests_common.yml | 10 ++++++++++ .github/workflows/build_image_cuda.yml | 13 ++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml index 8f4a8d8562..e2ad875a9a 100644 --- a/.github/workflows/all_tests_common.yml +++ b/.github/workflows/all_tests_common.yml @@ -22,6 +22,11 @@ on: type: string description: Override runs_on. Falls back to platform config if not set. default: "" + container_volumes: + required: false + type: string + description: Override container_volumes. Falls back to platform config if not set. 
+ default: "" jobs: checkout_and_config: @@ -128,6 +133,11 @@ jobs: if [ -n "$RUNS_ON_INPUT" ]; then { echo 'runs_on<<EOF'; echo "$RUNS_ON_INPUT"; echo 'EOF'; } >> $GITHUB_OUTPUT fi + # (double quotes inside the JSON would break echo "container_volumes=${{ inputs.container_volumes }}") + CONTAINER_VOLUMES_INPUT='${{ inputs.container_volumes }}' + if [ -n "$CONTAINER_VOLUMES_INPUT" ]; then + { echo 'container_volumes<<EOF'; echo "$CONTAINER_VOLUMES_INPUT"; echo 'EOF'; } >> $GITHUB_OUTPUT + fi # CLI validation runs first (outside virtual env) as a gate for all subsequent tests cli_validation: diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index d1c4f80322..081eb469da 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -45,6 +45,16 @@ on: description: 'Directory to store image tar files' type: string default: '/home/flagscale_cicd/images_tar' + runs_on: + required: false + type: string + description: Override runs_on. Falls back to platform config if not set. + default: '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' + container_volumes: + required: false + type: string + description: Override container_volumes. Falls back to platform config if not set.
+ default: '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]' # Trigger on PRs that modify docker-related files (build + test only, no push) pull_request: @@ -318,4 +328,5 @@ jobs: platform: cuda ci_train_image: ${{ needs.build.outputs.train_tag }} ci_inference_image: ${{ needs.build.outputs.inference_tag }} - runs_on: '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' + runs_on: ${{ inputs.runs_on || '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' }} + container_volumes: ${{ inputs.container_volumes || '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]'}} From d6301a7b76ba9d5ba1e47a42a62d5a9dee886c84 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Wed, 15 Apr 2026 13:29:54 +0800 Subject: [PATCH 25/29] ci: add runs_on and container_volumes as overridable workflow inputs --- .github/workflows/build_image_cuda.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 081eb469da..945203be73 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -54,7 +54,9 @@ on: required: false type: string description: Override container_volumes. Falls back to platform config if not set. 
- default: '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]' + default: >- + ["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", + "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"] # Trigger on PRs that modify docker-related files (build + test only, no push) pull_request: @@ -328,5 +330,10 @@ jobs: platform: cuda ci_train_image: ${{ needs.build.outputs.train_tag }} ci_inference_image: ${{ needs.build.outputs.inference_tag }} - runs_on: ${{ inputs.runs_on || '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' }} - container_volumes: ${{ inputs.container_volumes || '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]'}} + runs_on: >- + ${{ inputs.runs_on || + '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' }} + container_volumes: >- + ${{ inputs.container_volumes || + '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", + "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]' }} From abcb13bec7bd246f388be1bbe515110b1272fe62 Mon Sep 17 00:00:00 2001 From: zihugithub Date: Wed, 15 Apr 2026 15:52:33 +0800 Subject: [PATCH 26/29] debug0 --- .github/workflows/all_tests_common.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml index e2ad875a9a..f1ed893fcb 100644 --- a/.github/workflows/all_tests_common.yml +++ b/.github/workflows/all_tests_common.yml @@ -314,11 +314,11 @@ jobs: # Check all test jobs (skip if not run) failed=false - if [ "${{ needs.unit_tests.result }}" != "success" ] && \ - [ "${{ needs.unit_tests.result }}" != "skipped" ]; then - echo "❌ Unit tests 
failed" - failed=true - fi + # if [ "${{ needs.unit_tests.result }}" != "success" ] && \ + # [ "${{ needs.unit_tests.result }}" != "skipped" ]; then + # echo "❌ Unit tests failed" + # failed=true + # fi if [ "${{ needs.cli_validation.result }}" != "success" ] && \ [ "${{ needs.cli_validation.result }}" != "skipped" ]; then From 747517ca5f7ef2a0f8a2d9100663ec82ed5eb30f Mon Sep 17 00:00:00 2001 From: zihugithub Date: Wed, 15 Apr 2026 19:15:31 +0800 Subject: [PATCH 27/29] ci: update Ascend volume paths and re-enable unit tests --- .github/configs/ascend.yml | 4 +- .github/workflows/all_tests_common.yml | 66 +++++++++++++------------- tests/__init__.py | 1 + tests/unit_tests/__init__.py | 1 + 4 files changed, 37 insertions(+), 35 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/unit_tests/__init__.py diff --git a/.github/configs/ascend.yml b/.github/configs/ascend.yml index d2879fabbd..db071e052d 100644 --- a/.github/configs/ascend.yml +++ b/.github/configs/ascend.yml @@ -15,8 +15,8 @@ runner_labels: ["flagscale-ascend-ascend910-gpu2-32c-128g"] # Container volumes (hardware-specific paths) container_volumes: - - /public/cicd/baai_datasets:/home/gitlab-runner/data - - /public/cicd/baai_tokenizers:/home/gitlab-runner/tokenizers + - /public-ks3/cicd/baai_datasets:/home/gitlab-runner/data + - /public-ks3/cicd/baai_tokenizers:/home/gitlab-runner/tokenizers - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro - /usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons:ro - /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi:ro diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml index f1ed893fcb..94cc9cdbdc 100644 --- a/.github/workflows/all_tests_common.yml +++ b/.github/workflows/all_tests_common.yml @@ -154,34 +154,34 @@ jobs: env_name: ${{ needs.checkout_and_config.outputs.env_name_train }} env_path: ${{ needs.checkout_and_config.outputs.env_path }} - # unit_tests: - # needs: - # - checkout_and_config - # - 
cli_validation - # if: fromJson(needs.checkout_and_config.outputs.device_types)[0] != null - # strategy: - # fail-fast: false - # matrix: - # device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }} - # uses: ./.github/workflows/unit_tests_common.yml - # name: unit_tests - # with: - # platform: ${{ inputs.platform }} - # device: ${{ matrix.device }} - # image: ${{ needs.checkout_and_config.outputs.ci_train_image }} - # runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} - # container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} - # container_options: ${{ needs.checkout_and_config.outputs.container_options }} - # source_artifact: flagscale-source-${{ github.sha }} - # pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} - # env_name: ${{ needs.checkout_and_config.outputs.env_name_train }} - # env_path: ${{ needs.checkout_and_config.outputs.env_path }} + unit_tests: + needs: + - checkout_and_config + - cli_validation + if: fromJson(needs.checkout_and_config.outputs.device_types)[0] != null + strategy: + fail-fast: false + matrix: + device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }} + uses: ./.github/workflows/unit_tests_common.yml + name: unit_tests + with: + platform: ${{ inputs.platform }} + device: ${{ matrix.device }} + image: ${{ needs.checkout_and_config.outputs.ci_train_image }} + runs_on: ${{ needs.checkout_and_config.outputs.runs_on }} + container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }} + container_options: ${{ needs.checkout_and_config.outputs.container_options }} + source_artifact: flagscale-source-${{ github.sha }} + pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }} + env_name: ${{ needs.checkout_and_config.outputs.env_name_train }} + env_path: ${{ needs.checkout_and_config.outputs.env_path }} functional_tests_train: needs: - checkout_and_config - cli_validation - # - unit_tests + - unit_tests if: 
fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_train.yml with: @@ -200,7 +200,7 @@ jobs: needs: - checkout_and_config - cli_validation - # - unit_tests + - unit_tests if: fromJson(needs.checkout_and_config.outputs.hetero_train_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_hetero_train.yml with: @@ -219,7 +219,7 @@ jobs: needs: - checkout_and_config - cli_validation - # - unit_tests + - unit_tests if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_inference.yml with: @@ -238,7 +238,7 @@ jobs: needs: - checkout_and_config - cli_validation - # - unit_tests + - unit_tests if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_serve.yml with: @@ -276,7 +276,7 @@ jobs: needs: - checkout_and_config - cli_validation - # - unit_tests + - unit_tests if: fromJson(needs.checkout_and_config.outputs.benchmark_test_matrix)[0] != null uses: ./.github/workflows/functional_tests_benchmark.yml with: @@ -298,7 +298,7 @@ jobs: needs: - checkout_and_config - cli_validation - # - unit_tests + - unit_tests - functional_tests_train - functional_tests_hetero_train - functional_tests_benchmark @@ -314,11 +314,11 @@ jobs: # Check all test jobs (skip if not run) failed=false - # if [ "${{ needs.unit_tests.result }}" != "success" ] && \ - # [ "${{ needs.unit_tests.result }}" != "skipped" ]; then - # echo "❌ Unit tests failed" - # failed=true - # fi + if [ "${{ needs.unit_tests.result }}" != "success" ] && \ + [ "${{ needs.unit_tests.result }}" != "skipped" ]; then + echo "❌ Unit tests failed" + failed=true + fi if [ "${{ needs.cli_validation.result }}" != "success" ] && \ [ "${{ needs.cli_validation.result }}" != "skipped" ]; then diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000..db5dae1f3c --- /dev/null +++ b/tests/__init__.py @@ 
-0,0 +1 @@ +# This file makes tests/ a Python package diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py new file mode 100644 index 0000000000..9add4932b0 --- /dev/null +++ b/tests/unit_tests/__init__.py @@ -0,0 +1 @@ +# This file makes tests/unit_tests/ a Python package From 852340d79f0d4e2eac4132babf627df3a0519a40 Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Thu, 16 Apr 2026 14:38:28 +0800 Subject: [PATCH 28/29] ignore docs --- .github/workflows/all_tests_ascend.yml | 2 ++ .github/workflows/all_tests_cuda.yml | 36 +++++++++++++++++++++++++- .github/workflows/all_tests_metax.yml | 2 ++ .github/workflows/build_image_cuda.yml | 2 +- .github/workflows/format_check.yml | 2 ++ 5 files changed, 42 insertions(+), 2 deletions(-) diff --git a/.github/workflows/all_tests_ascend.yml b/.github/workflows/all_tests_ascend.yml index 459c8616a9..401eeb813f 100644 --- a/.github/workflows/all_tests_ascend.yml +++ b/.github/workflows/all_tests_ascend.yml @@ -4,6 +4,7 @@ on: push: branches: ["main"] paths-ignore: + - 'docs/**' - 'docker/cuda/**' - 'docker/build.sh' - 'tools/install/**' @@ -13,6 +14,7 @@ on: pull_request: branches: ["main"] paths-ignore: + - 'docs/**' - 'docker/cuda/**' - 'docker/build.sh' - 'tools/install/**' diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml index 2374e10850..db6215ceab 100644 --- a/.github/workflows/all_tests_cuda.yml +++ b/.github/workflows/all_tests_cuda.yml @@ -4,6 +4,7 @@ on: push: branches: ["main"] paths-ignore: + - 'docs/**' - 'docker/cuda/**' - 'docker/build.sh' - 'tools/install/**' @@ -13,6 +14,7 @@ on: pull_request: branches: ["main"] paths-ignore: + - 'docs/**' - 'docker/cuda/**' - 'docker/build.sh' - 'tools/install/**' @@ -25,19 +27,51 @@ concurrency: cancel-in-progress: true jobs: + # --------------------------------------------------------------------------- + # Guard: skip tests when PR contains docker-related changes, because + # build_image_cuda.yml will handle build + 
test for those PRs. + # paths-ignore alone cannot guarantee mutual exclusivity when a PR touches + # both docker files and non-docker files, so we need this job-level check. + # --------------------------------------------------------------------------- + check_docker_changes: + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + outputs: + has_docker_changes: ${{ steps.check.outputs.has_docker_changes }} + steps: + - name: Check for docker-related file changes + id: check + env: + GH_TOKEN: ${{ github.token }} + run: | + CHANGED=$(gh api repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/files --paginate --jq '.[].filename') + if echo "$CHANGED" | grep -qE '^(docker/cuda/|docker/build\.sh$|tools/install/|requirements/|\.github/workflows/build_image_cuda\.yml$)'; then + echo "has_docker_changes=true" >> $GITHUB_OUTPUT + else + echo "has_docker_changes=false" >> $GITHUB_OUTPUT + fi + run_tests: + needs: check_docker_changes + # On push: always run (paths-ignore already filtered). + # On PR: only run when no docker-related files changed. 
+ if: always() && (github.event_name == 'push' || needs.check_docker_changes.outputs.has_docker_changes != 'true') # Package manager and environment settings are read from .github/configs/cuda.yml uses: ./.github/workflows/all_tests_common.yml with: platform: cuda all_tests: - needs: run_tests + needs: [check_docker_changes, run_tests] runs-on: ubuntu-latest if: always() steps: - name: Verify workflow status run: | + if [ "${{ needs.check_docker_changes.outputs.has_docker_changes }}" = "true" ]; then + echo "⏭️ Skipped - docker changes detected, handled by build_image_cuda workflow" + exit 0 + fi if [ "${{ needs.run_tests.result }}" != "success" ]; then echo "❌ Tests workflow failed" exit 1 diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml index b350a6c4e0..d459e040b6 100644 --- a/.github/workflows/all_tests_metax.yml +++ b/.github/workflows/all_tests_metax.yml @@ -4,6 +4,7 @@ on: push: branches: ["main"] paths-ignore: + - 'docs/**' - 'docker/cuda/**' - 'docker/build.sh' - 'tools/install/**' @@ -13,6 +14,7 @@ on: pull_request: branches: ["main"] paths-ignore: + - 'docs/**' - 'docker/cuda/**' - 'docker/build.sh' - 'tools/install/**' diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 945203be73..3e8a500899 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -61,7 +61,7 @@ on: # Trigger on PRs that modify docker-related files (build + test only, no push) pull_request: branches: [main] - paths: + paths:∏ - 'docker/cuda/**' - 'docker/build.sh' - 'tools/install/**' diff --git a/.github/workflows/format_check.yml b/.github/workflows/format_check.yml index 2aa817cc58..ba3ca9b97b 100644 --- a/.github/workflows/format_check.yml +++ b/.github/workflows/format_check.yml @@ -4,6 +4,8 @@ on: pull_request: branches: [ "main" ] types: [opened, synchronize, reopened] + paths-ignore: + - 'docs/**' jobs: format: From 
264d68e8bc7c386eef3a20c1e00383502e716aec Mon Sep 17 00:00:00 2001 From: liyuzhuo Date: Thu, 16 Apr 2026 14:54:29 +0800 Subject: [PATCH 29/29] syntax --- .github/workflows/build_image_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml index 3e8a500899..945203be73 100644 --- a/.github/workflows/build_image_cuda.yml +++ b/.github/workflows/build_image_cuda.yml @@ -61,7 +61,7 @@ on: # Trigger on PRs that modify docker-related files (build + test only, no push) pull_request: branches: [main] - paths:∏ + paths: - 'docker/cuda/**' - 'docker/build.sh' - 'tools/install/**'