From 2314a2165247eacbc8e4bf023d7afd71bd0b25d5 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 10 Mar 2026 17:27:07 +0800
Subject: [PATCH 01/29] [CICD] Add auto build and push CUDA Docker images to
 Harbor pipeline

---
 .github/workflows/all_tests_cuda.yml    |  20 ++
 .github/workflows/build_image_cuda.yml  | 294 ++++++++++++++++++++++++
 .github/workflows/push_image_harbor.yml | 126 ++++++++++
 requirements/cuda/serve.txt             |   2 +-
 4 files changed, 441 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/build_image_cuda.yml
 create mode 100644 .github/workflows/push_image_harbor.yml

diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index 0b34872207..6b05048c1a 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -1,6 +1,11 @@
 name: cuda_tests
 
 on:
+  # Trigger when "Build Docker Images - CUDA" completes (any conclusion); jobs check for success in their `if`
+  workflow_run:
+    workflows: ["Build Docker Images - CUDA"]
+    types: [completed]
+
   push:
     branches: ["main"]
   pull_request:
@@ -12,6 +17,10 @@ concurrency:
 
 jobs:
   run_tests:
+    # Skip if triggered by workflow_run but the build did not succeed
+    if: >-
+      github.event_name != 'workflow_run' ||
+      github.event.workflow_run.conclusion == 'success'
     # Package manager and environment settings are read from .github/configs/cuda.yml
     uses: ./.github/workflows/all_tests_common.yml
     with:
@@ -29,3 +38,14 @@ jobs:
             exit 1
           fi
           echo "✅ All tests passed!"
+
+  push_images_to_harbor:
+    needs: all_tests
+    if: >-
+      needs.all_tests.result == 'success' &&
+      github.event_name == 'workflow_run' &&
+      github.event.workflow_run.conclusion == 'success'
+    uses: ./.github/workflows/push_image_harbor.yml
+    with:
+      platform: cuda
+    secrets: inherit
\ No newline at end of file
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
new file mode 100644
index 0000000000..12d531cffa
--- /dev/null
+++ b/.github/workflows/build_image_cuda.yml
@@ -0,0 +1,294 @@
+name: Build Docker Images - CUDA
+
+on:
+  # Manual trigger with configurable options
+  workflow_dispatch:
+    inputs:
+      task:
+        description: 'Task to build'
+        required: true
+        type: choice
+        options:
+          - train
+          - inference
+          - all
+        default: 'train'
+      target:
+        description: 'Build target stage (dev includes dev tools, release is production)'
+        required: true
+        type: choice
+        options:
+          - dev
+          - release
+        default: 'dev'
+      push:
+        description: 'Push image to registry'
+        type: boolean
+        default: true
+      no_cache:
+        description: 'Build without Docker cache'
+        type: boolean
+        default: false
+      pkg_mgr:
+        description: 'Package manager to use'
+        required: true
+        type: choice
+        options:
+          - conda
+          - uv
+        default: 'conda'
+      build_all_tasks:
+        description: 'Build all tasks (train, inference, all) - overrides task selection'
+        type: boolean
+        default: false
+
+  # Trigger on PRs that modify docker-related files
+  pull_request:
+    branches: [main]
+    paths:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  # Local registry used by CI runners (see .github/configs/cuda.yml)
+  REGISTRY: localhost:5000
+  # Default build versions (keep in sync with docker/build.sh)
+  CUDA_VERSION: '12.8.1'
+  UBUNTU_VERSION: '22.04'
+  PYTHON_VERSION: '3.12'
+  UV_VERSION: '0.7.2'
+  PKG_MGR: ${{ inputs.pkg_mgr || 'conda' }}
+
+jobs:
+  # ---------------------------------------------------------------------------
+  # Prepare: compute build matrix and parameters based on trigger type
+  # ---------------------------------------------------------------------------
+  prepare:
+    name: Prepare build matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      target: ${{ steps.params.outputs.target }}
+      push: ${{ steps.params.outputs.push }}
+      no_cache: ${{ steps.params.outputs.no_cache }}
+    steps:
+      - name: Determine build matrix
+        id: set-matrix
+        run: |
+          EVENT="${{ github.event_name }}"
+
+          if [ "$EVENT" = "workflow_dispatch" ] && [ "${{ inputs.build_all_tasks }}" != "true" ]; then
+            # Manual trigger: build selected task only
+            echo 'matrix={"task":["${{ inputs.task }}"]}' >> $GITHUB_OUTPUT
+          else
+            # Push to main or build_all_tasks=true: build all tasks
+            echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT
+          fi
+
+      - name: Set build parameters
+        id: params
+        run: |
+          EVENT="${{ github.event_name }}"
+
+          if [ "$EVENT" = "push" ]; then
+            # Push to main: always build dev images and push
+            echo "target=dev" >> $GITHUB_OUTPUT
+            echo "push=true" >> $GITHUB_OUTPUT
+            echo "no_cache=false" >> $GITHUB_OUTPUT
+          else
+            echo "target=${{ inputs.target || 'dev' }}" >> $GITHUB_OUTPUT
+            echo "push=${{ inputs.push }}" >> $GITHUB_OUTPUT
+            echo "no_cache=${{ inputs.no_cache }}" >> $GITHUB_OUTPUT
+          fi
+
+  # ---------------------------------------------------------------------------
+  # Build: build and push Docker images (matrix across tasks)
+  # ---------------------------------------------------------------------------
+  build:
+    name: Build ${{ matrix.task }}
+    needs: prepare
+    runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
+    strategy:
+      fail-fast: false
+      matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
+    outputs:
+      train_tag: ${{ steps.export.outputs.train_tag }}
+      inference_tag: ${{ steps.export.outputs.inference_tag }}
+      all_tag: ${{ steps.export.outputs.all_tag }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+        with:
+          # Use docker driver to avoid pulling moby/buildkit from Docker Hub
+          driver: docker
+
+      - name: Compute build metadata
+        id: meta
+        run: |
+          set -euo pipefail
+
+          TASK="${{ matrix.task }}"
+          TARGET="${{ needs.prepare.outputs.target }}"
+          CUDA_VERSION="${{ env.CUDA_VERSION }}"
+          PYTHON_VERSION="${{ env.PYTHON_VERSION }}"
+          UBUNTU_VERSION="${{ env.UBUNTU_VERSION }}"
+
+          CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1)
+          CUDA_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f2)
+          TIMESTAMP=$(date +%Y%m%d%H%M%S)
+
+          # Image naming follows docker/build.sh convention:
+          #   flagscale-<task>:<target>-cu<major><minor>-py<version>-<timestamp>
+          IMAGE_NAME="flagscale-${TASK}"
+          TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}-${TIMESTAMP}"
+
+          # Local registry tag (for CI runners)
+          LOCAL_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG}"
+
+          # Derived build arguments
+          BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
+          PYTORCH_INDEX="https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}"
+
+          # Tags list: local registry only
+          TAGS="${LOCAL_TAG}"
+
+          echo "image_name=${IMAGE_NAME}" >> $GITHUB_OUTPUT
+          echo "tag=${TAG}" >> $GITHUB_OUTPUT
+          echo "local_tag=${LOCAL_TAG}" >> $GITHUB_OUTPUT
+          echo "base_image=${BASE_IMAGE}" >> $GITHUB_OUTPUT
+          echo "pytorch_index=${PYTORCH_INDEX}" >> $GITHUB_OUTPUT
+
+          # Multi-line tags output
+          {
+            echo "tags<<EOF"
+            echo "${TAGS}"
+            echo "EOF"
+          } >> $GITHUB_OUTPUT
+
+          # Job summary
+          {
+            echo "### Build: ${IMAGE_NAME}"
+            echo ""
+            echo "| Parameter | Value |"
+            echo "|---|---|"
+            echo "| Task | \`${TASK}\` |"
+            echo "| Target | \`${TARGET}\` |"
+            echo "| CUDA | \`${CUDA_VERSION}\` |"
+            echo "| Python | \`${PYTHON_VERSION}\` |"
+            echo "| Dockerfile | \`docker/cuda/Dockerfile.${TASK}\` |"
+            echo "| Local Tag | \`${LOCAL_TAG}\` |"
+          } >> $GITHUB_STEP_SUMMARY
+
+          echo "PKG_MGR: ${{ env.PKG_MGR }}"
+
+      - name: Build Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: docker/cuda/Dockerfile.${{ matrix.task }}
+          target: ${{ needs.prepare.outputs.target }}
+          load: true
+          tags: ${{ steps.meta.outputs.tags }}
+          build-args: |
+            BASE_IMAGE=${{ steps.meta.outputs.base_image }}
+            CUDA_VERSION=${{ env.CUDA_VERSION }}
+            UBUNTU_VERSION=${{ env.UBUNTU_VERSION }}
+            PYTHON_VERSION=${{ env.PYTHON_VERSION }}
+            UV_VERSION=${{ env.UV_VERSION }}
+            PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }}
+            PKG_MGR=${{ env.PKG_MGR }}
+          no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}
+
+      - name: Push Docker image
+        if: needs.prepare.outputs.push == 'true'
+        run: docker push ${{ steps.meta.outputs.local_tag }}
+
+      - name: Export image tag for config update
+        id: export
+        if: success() && needs.prepare.outputs.push == 'true'
+        run: |
+          TASK="${{ matrix.task }}"
+          echo "${TASK}_tag=${{ steps.meta.outputs.local_tag }}" >> $GITHUB_OUTPUT
+
+      - name: Print build result
+        if: success()
+        run: |
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "**Result:** Built successfully" >> $GITHUB_STEP_SUMMARY
+          echo "**Pushed:** ${{ needs.prepare.outputs.push }}" >> $GITHUB_STEP_SUMMARY
+
+  # ---------------------------------------------------------------------------
+  # Update Config: update cuda.yml with localhost tags (temporary, for test validation)
+  # After tests pass, push_image_harbor.yml will promote to Harbor and update to final tags
+  # ---------------------------------------------------------------------------
+  update-config:
+    name: Update CI config
+    needs: build
+    runs-on: [self-hosted, Linux, X64]
+    if: needs.prepare.outputs.push == 'true' && needs.build.result == 'success'
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.ref }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Update cuda.yml with new image tags
+        run: |
+          set -euo pipefail
+          CONFIG_FILE=".github/configs/cuda.yml"
+
+          TRAIN_TAG="${{ needs.build.outputs.train_tag }}"
+          INFERENCE_TAG="${{ needs.build.outputs.inference_tag }}"
+
+          if [ -n "$TRAIN_TAG" ]; then
+            echo "Updating ci_train_image to: $TRAIN_TAG"
+            sed -i "s|^ci_train_image:.*|ci_train_image: ${TRAIN_TAG}|" "$CONFIG_FILE"
+          fi
+
+          if [ -n "$INFERENCE_TAG" ]; then
+            echo "Updating ci_inference_image to: $INFERENCE_TAG"
+            sed -i "s|^ci_inference_image:.*|ci_inference_image: ${INFERENCE_TAG}|" "$CONFIG_FILE"
+          fi
+
+          echo "Updated config:"
+          cat "$CONFIG_FILE"
+
+      - name: Commit and push config update
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add .github/configs/cuda.yml
+          if git diff --cached --quiet; then
+            echo "No config changes to commit"
+          else
+            git commit -m "ci: update CUDA image tags [skip ci]"
+            git push
+          fi
+
+  # ---------------------------------------------------------------------------
+  # Summary: verify all builds completed
+  # ---------------------------------------------------------------------------
+  summary:
+    name: Build summary
+    needs: update-config
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Verify build results
+        run: |
+          if [ "${{ needs.update-config.result }}" != "success" ]; then
+            echo "::error::Image build or CI config update did not succeed (update-config: ${{ needs.update-config.result }})"
+            exit 1
+          fi
+          echo "All Docker images built successfully!"
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
new file mode 100644
index 0000000000..2a89aff39d
--- /dev/null
+++ b/.github/workflows/push_image_harbor.yml
@@ -0,0 +1,126 @@
+name: Push Images to Harbor
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+        description: "Platform name (e.g. cuda), used to locate .github/configs/<platform>.yml"
+
+env:
+  REMOTE_REGISTRY: harbor.baai.ac.cn
+  REMOTE_IMAGE_PREFIX: flagscale
+
+jobs:
+  promote:
+    name: Push validated images to Harbor
+    runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Read image tags from config
+        id: tags
+        run: |
+          set -euo pipefail
+          CONFIG_FILE=".github/configs/${{ inputs.platform }}.yml"
+          # awk exits 0 when a key is absent (grep would abort the step under pipefail), so a missing tag stays empty
+          TRAIN_TAG=$(awk '/^ci_train_image:/ {print $2}' "$CONFIG_FILE")
+          INFERENCE_TAG=$(awk '/^ci_inference_image:/ {print $2}' "$CONFIG_FILE")
+
+          echo "train_tag=${TRAIN_TAG}" >> "$GITHUB_OUTPUT"
+          echo "inference_tag=${INFERENCE_TAG}" >> "$GITHUB_OUTPUT"
+
+          # Check if images are from localhost (freshly built, need promotion)
+          if echo "${TRAIN_TAG}${INFERENCE_TAG}" | grep -q 'localhost'; then
+            echo "needs_promotion=true" >> "$GITHUB_OUTPUT"
+            echo "Images are from localhost, promotion needed"
+          else
+            echo "needs_promotion=false" >> "$GITHUB_OUTPUT"
+            echo "Images already on Harbor, skipping promotion"
+          fi
+
+      - name: Login to Harbor registry
+        if: steps.tags.outputs.needs_promotion == 'true'
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REMOTE_REGISTRY }}
+          username: ${{ secrets.REGISTRY_USERNAME }}
+          password: ${{ secrets.CONTAINER_REGISTRY }}
+
+      - name: Promote train image to Harbor
+        id: promote_train
+        if: steps.tags.outputs.needs_promotion == 'true' && steps.tags.outputs.train_tag != ''
+        run: |
+          set -euo pipefail
+          LOCAL_TAG="${{ steps.tags.outputs.train_tag }}"
+          IMAGE_AND_TAG="${LOCAL_TAG#*/}"
+          REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}"
+
+          echo "Promoting: ${LOCAL_TAG} → ${REMOTE_TAG}"
+          docker pull "${LOCAL_TAG}"
+          docker tag "${LOCAL_TAG}" "${REMOTE_TAG}"
+          docker push "${REMOTE_TAG}"
+
+          echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT
+          echo "Successfully pushed ${REMOTE_TAG}"
+
+      - name: Promote inference image to Harbor
+        id: promote_inference
+        if: steps.tags.outputs.needs_promotion == 'true' && steps.tags.outputs.inference_tag != ''
+        run: |
+          set -euo pipefail
+          LOCAL_TAG="${{ steps.tags.outputs.inference_tag }}"
+          IMAGE_AND_TAG="${LOCAL_TAG#*/}"
+          REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}"
+
+          echo "Promoting: ${LOCAL_TAG} → ${REMOTE_TAG}"
+          docker pull "${LOCAL_TAG}"
+          docker tag "${LOCAL_TAG}" "${REMOTE_TAG}"
+          docker push "${REMOTE_TAG}"
+
+          echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT
+          echo "Successfully pushed ${REMOTE_TAG}"
+
+      - name: Commit and push config update
+        if: steps.tags.outputs.needs_promotion == 'true'
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add .github/configs/${{ inputs.platform }}.yml
+          if git diff --cached --quiet; then
+            echo "No config changes to commit"
+          else
+            git commit -m "ci: promote ${{ inputs.platform }} image tags to Harbor [skip ci]"
+            git push
+          fi
+
+  # ---------------------------------------------------------------------------
+  # Cleanup: clean up Docker build cache and dangling images on self-hosted runner
+  # ---------------------------------------------------------------------------
+  cleanup:
+    name: Clean up build cache
+    needs: promote
+    runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
+    if: always()
+    steps:
+      - name: Remove dangling images
+        run: docker image prune -f 2>/dev/null || true
+
+      - name: Remove build cache older than 7 days
+        run: docker builder prune -f --filter "until=168h" 2>/dev/null || true
+
+      - name: Remove old localhost registry images
+        run: |
+          # Remove localhost:5000 images whose age Docker reports in weeks, months or years (about 14+ days old)
+          docker images --format '{{.Repository}}:{{.Tag}} {{.CreatedSince}}' \
+            | grep 'localhost:5000' \
+            | grep -E '(weeks|months|years)' \
+            | awk '{print $1}' \
+            | xargs -r docker rmi 2>/dev/null || true
+
+      - name: Report disk usage
+        run: |
+          echo "Docker disk usage:"
+          docker system df
diff --git a/requirements/cuda/serve.txt b/requirements/cuda/serve.txt
index 02dd3ca0e4..81ba8265f6 100644
--- a/requirements/cuda/serve.txt
+++ b/requirements/cuda/serve.txt
@@ -1,7 +1,7 @@
 # serve-specific dependencies
 
 -r ./base.txt
-vllm @ https://resource.flagos.net/repository/flagos-pypi-hosted/packages/vllm/0.11.0%2Bfl.0.1.cu124/vllm-0.11.0%2Bfl.0.1.cu124-cp312-cp312-linux_x86_64.whl
+vllm @ https://resource.flagos.net/repository/flagos-pypi-hosted/packages/vllm/0.13.0%2Bfl.0.1.cu128.g72506c983/vllm-0.13.0%2Bfl.0.1.cu128.g72506c983-cp312-cp312-linux_x86_64.whl
 
 # support 0.5b_multiple_instance ci test
 ray==2.49.1

From 6a6a947508e34bcd0f9c1e6350e63912f308d2a9 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 10 Mar 2026 18:07:58 +0800
Subject: [PATCH 02/29] fix(ci): select build params on pull_request, drop Harbor config commit

---
 .github/workflows/all_tests_cuda.yml    |  2 +-
 .github/workflows/build_image_cuda.yml  | 11 +++++------
 .github/workflows/push_image_harbor.yml | 13 -------------
 3 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index 6b05048c1a..e02f580c43 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -48,4 +48,4 @@ jobs:
     uses: ./.github/workflows/push_image_harbor.yml
     with:
       platform: cuda
-    secrets: inherit
\ No newline at end of file
+    secrets: inherit
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 12d531cffa..43125a2143 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -88,7 +88,7 @@ jobs:
             # Manual trigger: build selected task only
             echo 'matrix={"task":["${{ inputs.task }}"]}' >> $GITHUB_OUTPUT
           else
-            # Push to main or build_all_tasks=true: build all tasks
+            # PR or build_all_tasks=true: build all tasks
             echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT
           fi
 
@@ -97,12 +97,13 @@ jobs:
         run: |
           EVENT="${{ github.event_name }}"
 
-          if [ "$EVENT" = "push" ]; then
-            # Push to main: always build dev images and push
+          if [ "$EVENT" = "pull_request" ]; then
+            # PR: always build dev images, push to local registry, use cache
             echo "target=dev" >> $GITHUB_OUTPUT
             echo "push=true" >> $GITHUB_OUTPUT
             echo "no_cache=false" >> $GITHUB_OUTPUT
           else
+            # workflow_dispatch: use user-provided inputs
             echo "target=${{ inputs.target || 'dev' }}" >> $GITHUB_OUTPUT
             echo "push=${{ inputs.push }}" >> $GITHUB_OUTPUT
             echo "no_cache=${{ inputs.no_cache }}" >> $GITHUB_OUTPUT
@@ -189,8 +190,6 @@ jobs:
             echo "| Local Tag | \`${LOCAL_TAG}\` |"
           } >> $GITHUB_STEP_SUMMARY
 
-          echo "PKG_MGR: ${{ env.PKG_MGR }}"
-
       - name: Build Docker image
         uses: docker/build-push-action@v6
         with:
@@ -235,7 +234,7 @@ jobs:
     name: Update CI config
     needs: build
     runs-on: [self-hosted, Linux, X64]
-    if: needs.prepare.outputs.push == 'true' && needs.build.result == 'success'
+    if: needs.build.result == 'success'
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
index 2a89aff39d..d10e9a3e4c 100644
--- a/.github/workflows/push_image_harbor.yml
+++ b/.github/workflows/push_image_harbor.yml
@@ -83,19 +83,6 @@ jobs:
           echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT
           echo "Successfully pushed ${REMOTE_TAG}"
 
-      - name: Commit and push config update
-        if: steps.tags.outputs.needs_promotion == 'true'
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "github-actions[bot]@users.noreply.github.com"
-          git add .github/configs/${{ inputs.platform }}.yml
-          if git diff --cached --quiet; then
-            echo "No config changes to commit"
-          else
-            git commit -m "ci: promote ${{ inputs.platform }} image tags to Harbor [skip ci]"
-            git push
-          fi
-
   # ---------------------------------------------------------------------------
   # Cleanup: clean up Docker build cache and dangling images on self-hosted runner
   # ---------------------------------------------------------------------------

From 78432660794386966e20dd54da90099f6c862949 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 10 Mar 2026 18:28:04 +0800
Subject: [PATCH 03/29] Set safe directory

---
 .github/workflows/build_image_cuda.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 43125a2143..730a1c98f9 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -124,6 +124,10 @@ jobs:
       inference_tag: ${{ steps.export.outputs.inference_tag }}
       all_tag: ${{ steps.export.outputs.all_tag }}
     steps:
+      - name: Set safe directory
+        run: |
+          git config --global --add safe.directory .
+
       - name: Checkout code
         uses: actions/checkout@v4
 

From 1f895eb600e188abeae11164ae1cf9f5e4cb406b Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 10 Mar 2026 18:36:52 +0800
Subject: [PATCH 04/29] Clean workspace

---
 .github/workflows/build_image_cuda.yml  | 8 +++++---
 .github/workflows/push_image_harbor.yml | 3 +++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 730a1c98f9..80128017d5 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -124,9 +124,8 @@ jobs:
       inference_tag: ${{ steps.export.outputs.inference_tag }}
       all_tag: ${{ steps.export.outputs.all_tag }}
     steps:
-      - name: Set safe directory
-        run: |
-          git config --global --add safe.directory .
+      - name: Clean workspace
+        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true
 
       - name: Checkout code
         uses: actions/checkout@v4
@@ -240,6 +239,9 @@ jobs:
     runs-on: [self-hosted, Linux, X64]
     if: needs.build.result == 'success'
     steps:
+      - name: Clean workspace
+        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true
+
       - name: Checkout code
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
index d10e9a3e4c..5f2053063a 100644
--- a/.github/workflows/push_image_harbor.yml
+++ b/.github/workflows/push_image_harbor.yml
@@ -17,6 +17,9 @@ jobs:
     name: Push validated images to Harbor
     runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
     steps:
+      - name: Clean workspace
+        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true
+
       - name: Checkout code
         uses: actions/checkout@v4
 

From 903c520ce84ba055366980b6b88591bdb2a7ebf4 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Wed, 11 Mar 2026 10:43:07 +0800
Subject: [PATCH 05/29] Remove redundant clean workspace steps before checkout

---
 .github/workflows/build_image_cuda.yml  | 6 ------
 .github/workflows/push_image_harbor.yml | 3 ---
 2 files changed, 9 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 80128017d5..43125a2143 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -124,9 +124,6 @@ jobs:
       inference_tag: ${{ steps.export.outputs.inference_tag }}
       all_tag: ${{ steps.export.outputs.all_tag }}
     steps:
-      - name: Clean workspace
-        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true
-
       - name: Checkout code
         uses: actions/checkout@v4
 
@@ -239,9 +236,6 @@ jobs:
     runs-on: [self-hosted, Linux, X64]
     if: needs.build.result == 'success'
     steps:
-      - name: Clean workspace
-        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true
-
       - name: Checkout code
         uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
index 5f2053063a..d10e9a3e4c 100644
--- a/.github/workflows/push_image_harbor.yml
+++ b/.github/workflows/push_image_harbor.yml
@@ -17,9 +17,6 @@ jobs:
     name: Push validated images to Harbor
     runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
     steps:
-      - name: Clean workspace
-        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* || true
-
       - name: Checkout code
         uses: actions/checkout@v4
 

From 84d54bc17cc84c83d2cb3fefe8a4f25cdd5a1cad Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Thu, 12 Mar 2026 14:26:42 +0800
Subject: [PATCH 06/29] fix(ci): use head_ref for PR checkout to avoid detached
 HEAD on push

---
 .github/workflows/build_image_cuda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 43125a2143..d95236a48d 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -239,7 +239,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.ref }}
+          ref: ${{ github.head_ref || github.ref }}
           token: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Update cuda.yml with new image tags

From 753242a0cd20ba277892984410ad30bde3d9a850 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Mon, 16 Mar 2026 18:13:52 +0800
Subject: [PATCH 07/29] fix git fetch failure in update-config job

---
 .github/workflows/build_image_cuda.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index d95236a48d..db23dba087 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -52,6 +52,9 @@ on:
       - 'requirements/**'
       - '.github/workflows/build_image_cuda.yml'
 
+permissions:
+  contents: write
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
@@ -233,7 +236,7 @@ jobs:
   update-config:
     name: Update CI config
     needs: build
-    runs-on: [self-hosted, Linux, X64]
+    runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
     if: needs.build.result == 'success'
     steps:
       - name: Checkout code
@@ -241,6 +244,8 @@ jobs:
         with:
           ref: ${{ github.head_ref || github.ref }}
           token: ${{ secrets.GITHUB_TOKEN }}
+          clean: true
+          fetch-depth: 0
 
       - name: Update cuda.yml with new image tags
         run: |

From 26c3a1407349363f3a47fafa5f5ea01eabcb2929 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 17 Mar 2026 09:52:44 +0800
Subject: [PATCH 08/29] Clean workspace

---
 .github/workflows/build_image_cuda.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index db23dba087..232a721529 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -127,6 +127,9 @@ jobs:
       inference_tag: ${{ steps.export.outputs.inference_tag }}
       all_tag: ${{ steps.export.outputs.all_tag }}
     steps:
+      - name: Clean workspace
+        run: sudo rm -rf "${GITHUB_WORKSPACE:?}"/* "${GITHUB_WORKSPACE:?}"/.[!.]* 2>/dev/null || true  # :? aborts if unset/empty
+
       - name: Checkout code
         uses: actions/checkout@v4
 
@@ -239,6 +242,9 @@ jobs:
     runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
     if: needs.build.result == 'success'
     steps:
+      - name: Clean workspace
+        run: sudo rm -rf "${GITHUB_WORKSPACE:?}"/* "${GITHUB_WORKSPACE:?}"/.[!.]* 2>/dev/null || true  # :? aborts if unset/empty
+
       - name: Checkout code
         uses: actions/checkout@v4
         with:

From bf71fbeb2717fddd802dfbe30b6c43e2cbb12300 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 17 Mar 2026 10:56:20 +0800
Subject: [PATCH 09/29] add paths-ignore for test workflow and fix PR checkout
 in build workflow

---
 .github/workflows/all_tests_cuda.yml   | 6 ++++++
 .github/workflows/build_image_cuda.yml | 8 ++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index e02f580c43..2b3441adbd 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -10,6 +10,12 @@ on:
     branches: ["main"]
   pull_request:
     branches: ["main"]
+    paths-ignore:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 232a721529..e04fb2d8f4 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -248,11 +248,15 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
         with:
-          ref: ${{ github.head_ref || github.ref }}
           token: ${{ secrets.GITHUB_TOKEN }}
-          clean: true
           fetch-depth: 0
 
+      - name: Switch to PR branch
+        if: github.event_name == 'pull_request'
+        run: |
+          git fetch origin "${{ github.head_ref }}"
+          git checkout "${{ github.head_ref }}"
+
       - name: Update cuda.yml with new image tags
         run: |
           set -euo pipefail

From 9cfa2e1483fbcc9b0761717d528a1d57d45fc118 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 17 Mar 2026 12:06:01 +0800
Subject: [PATCH 10/29] support fork PRs in build_image_cuda workflow

---
 .github/workflows/build_image_cuda.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index e04fb2d8f4..968bc61113 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -254,8 +254,11 @@ jobs:
       - name: Switch to PR branch
         if: github.event_name == 'pull_request'
         run: |
-          git fetch origin "${{ github.head_ref }}"
-          git checkout "${{ github.head_ref }}"
+          PR_REPO="${{ github.event.pull_request.head.repo.clone_url }}"
+          PR_BRANCH="${{ github.head_ref }}"
+          git remote add pr-head "${PR_REPO}" || true
+          git fetch pr-head "${PR_BRANCH}"
+          git checkout -b "${PR_BRANCH}" "pr-head/${PR_BRANCH}"
 
       - name: Update cuda.yml with new image tags
         run: |
@@ -287,7 +290,11 @@ jobs:
             echo "No config changes to commit"
           else
             git commit -m "ci: update CUDA image tags [skip ci]"
-            git push
+            if [ "${{ github.event_name }}" = "pull_request" ]; then
+              git push pr-head HEAD:"${{ github.head_ref }}"
+            else
+              git push
+            fi
           fi
 
   # ---------------------------------------------------------------------------

From 731083117fe0902ed881f8e8abe1c882fbeae087 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 17 Mar 2026 14:02:34 +0800
Subject: [PATCH 11/29] use FORK_PUSH_TOKEN for fork PR push and add token-help
 job

---
 .github/workflows/build_image_cuda.yml | 39 ++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 968bc61113..22f365c7af 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -254,9 +254,10 @@ jobs:
       - name: Switch to PR branch
         if: github.event_name == 'pull_request'
         run: |
-          PR_REPO="${{ github.event.pull_request.head.repo.clone_url }}"
+          PR_REPO_FULL="${{ github.event.pull_request.head.repo.full_name }}"
           PR_BRANCH="${{ github.head_ref }}"
-          git remote add pr-head "${PR_REPO}" || true
+          FORK_URL="https://x-access-token:${{ secrets.FORK_PUSH_TOKEN }}@github.com/${PR_REPO_FULL}.git"
+          git remote add pr-head "${FORK_URL}" || true
           git fetch pr-head "${PR_BRANCH}"
           git checkout -b "${PR_BRANCH}" "pr-head/${PR_BRANCH}"
 
@@ -297,12 +298,44 @@ jobs:
             fi
           fi
 
+  # ---------------------------------------------------------------------------
+  # Token help: print setup instructions when update-config fails on fork PRs
+  # ---------------------------------------------------------------------------
+  token-help:
+    name: Print token setup instructions
+    needs: update-config
+    runs-on: ubuntu-latest
+    if: failure() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository
+    steps:
+      - name: Print FORK_PUSH_TOKEN setup instructions
+        run: |
+          echo "::error::Push to fork failed. The FORK_PUSH_TOKEN secret is missing or invalid."
+          {
+            echo "### Push to fork failed — FORK_PUSH_TOKEN setup required"
+            echo ""
+            echo "**Step 1: Create a Personal Access Token (PAT)**"
+            echo "1. Go to your GitHub account: \`Settings → Developer settings → Personal access tokens → Fine-grained tokens\`"
+            echo "2. Click \`Generate new token\`"
+            echo "3. Set \`Repository access\` → \`Only select repositories\` → select your fork repo (\`${{ github.event.pull_request.head.repo.full_name }}\`)"
+            echo "4. Set \`Permissions\` → \`Repository permissions\` → \`Contents\`: **Read and Write**"
+            echo "5. Click \`Generate token\` and copy the token value"
+            echo ""
+            echo "**Step 2: Add the secret to the upstream repository**"
+            echo "1. Go to [\`${{ github.repository }}\`](https://github.com/${{ github.repository }}/settings/secrets/actions) → \`Settings → Secrets and variables → Actions\`"
+            echo "2. Click \`New repository secret\`"
+            echo "3. Name: \`FORK_PUSH_TOKEN\`"
+            echo "4. Value: paste the token from Step 1"
+            echo "5. Click \`Add secret\`"
+            echo ""
+            echo "**Step 3:** Re-run this workflow"
+          } >> $GITHUB_STEP_SUMMARY
+
   # ---------------------------------------------------------------------------
   # Summary: verify all builds completed
   # ---------------------------------------------------------------------------
   summary:
     name: Build summary
-    needs: update-config
+    needs: [update-config, token-help]
     runs-on: ubuntu-latest
     if: always()
     steps:

From 8b39e356c3fff3cefe73a6bc8d828cd5f1f1ff21 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 17 Mar 2026 16:11:50 +0800
Subject: [PATCH 12/29] fix(ci): use stable image tags without timestamp for
 registry push

---
 .github/configs/cuda.yml               |   4 +-
 .github/workflows/all_tests_cuda.yml   |  27 ++---
 .github/workflows/build_image_cuda.yml | 155 ++++++-------------------
 3 files changed, 48 insertions(+), 138 deletions(-)

diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index 093b97dac2..cd1d99461f 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -7,8 +7,8 @@ display_name: "CUDA Tests"
 
 # Docker image for this hardware
 ci_image: localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.1-time2510131515
-ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721
-ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12-20260302102033
+ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12
+ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12
 
 # Runner labels for this hardware
 runner_labels:
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index 2b3441adbd..e02ef07b77 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -1,13 +1,17 @@
 name: cuda_tests
 
 on:
-  # Trigger after Build Docker Images - CUDA succeeds
-  workflow_run:
-    workflows: ["Build Docker Images - CUDA"]
-    types: [completed]
+  # Called by Build Docker Images - CUDA workflow
+  workflow_call:
 
   push:
     branches: ["main"]
+    paths-ignore:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
   pull_request:
     branches: ["main"]
     paths-ignore:
@@ -23,10 +27,6 @@ concurrency:
 
 jobs:
   run_tests:
-    # Skip if triggered by workflow_run but the build did not succeed
-    if: >-
-      github.event_name != 'workflow_run' ||
-      github.event.workflow_run.conclusion == 'success'
     # Package manager and environment settings are read from .github/configs/cuda.yml
     uses: ./.github/workflows/all_tests_common.yml
     with:
@@ -44,14 +44,3 @@ jobs:
             exit 1
           fi
           echo "✅ All tests passed!"
-
-  push_images_to_harbor:
-    needs: all_tests
-    if: >-
-      needs.all_tests.result == 'success' &&
-      github.event_name == 'workflow_run' &&
-      github.event.workflow_run.conclusion == 'success'
-    uses: ./.github/workflows/push_image_harbor.yml
-    with:
-      platform: cuda
-    secrets: inherit
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 22f365c7af..44f3ef3efc 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -53,7 +53,7 @@ on:
       - '.github/workflows/build_image_cuda.yml'
 
 permissions:
-  contents: write
+  contents: read
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -155,33 +155,27 @@ jobs:
           TIMESTAMP=$(date +%Y%m%d%H%M%S)
 
           # Image naming follows docker/build.sh convention:
-          #   flagscale-<task>:<target>-cu<major><minor>-py<version>-<timestamp>
+          #   flagscale-<task>:<target>-cu<major><minor>-py<version>[-<timestamp>]
           IMAGE_NAME="flagscale-${TASK}"
-          TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}-${TIMESTAMP}"
+          TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}"
+          TAG_TS="${TAG}-${TIMESTAMP}"
 
-          # Local registry tag (for CI runners)
+          # Build tag (with timestamp, used during docker build)
+          BUILD_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG_TS}"
+          # Registry tag (without timestamp, used when pushing to registry)
           LOCAL_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG}"
 
           # Derived build arguments
           BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
           PYTORCH_INDEX="https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}"
 
-          # Tags list: local registry only
-          TAGS="${LOCAL_TAG}"
-
           echo "image_name=${IMAGE_NAME}" >> $GITHUB_OUTPUT
           echo "tag=${TAG}" >> $GITHUB_OUTPUT
+          echo "build_tag=${BUILD_TAG}" >> $GITHUB_OUTPUT
           echo "local_tag=${LOCAL_TAG}" >> $GITHUB_OUTPUT
           echo "base_image=${BASE_IMAGE}" >> $GITHUB_OUTPUT
           echo "pytorch_index=${PYTORCH_INDEX}" >> $GITHUB_OUTPUT
 
-          # Multi-line tags output
-          {
-            echo "tags<<EOF"
-            echo "${TAGS}"
-            echo "EOF"
-          } >> $GITHUB_OUTPUT
-
           # Job summary
           {
             echo "### Build: ${IMAGE_NAME}"
@@ -193,7 +187,8 @@ jobs:
             echo "| CUDA | \`${CUDA_VERSION}\` |"
             echo "| Python | \`${PYTHON_VERSION}\` |"
             echo "| Dockerfile | \`docker/cuda/Dockerfile.${TASK}\` |"
-            echo "| Local Tag | \`${LOCAL_TAG}\` |"
+            echo "| Build Tag | \`${BUILD_TAG}\` |"
+            echo "| Registry Tag | \`${LOCAL_TAG}\` |"
           } >> $GITHUB_STEP_SUMMARY
 
       - name: Build Docker image
@@ -203,7 +198,7 @@ jobs:
           file: docker/cuda/Dockerfile.${{ matrix.task }}
           target: ${{ needs.prepare.outputs.target }}
           load: true
-          tags: ${{ steps.meta.outputs.tags }}
+          tags: ${{ steps.meta.outputs.build_tag }}
           build-args: |
             BASE_IMAGE=${{ steps.meta.outputs.base_image }}
             CUDA_VERSION=${{ env.CUDA_VERSION }}
@@ -216,7 +211,9 @@ jobs:
 
       - name: Push Docker image
         if: needs.prepare.outputs.push == 'true'
-        run: docker push ${{ steps.meta.outputs.local_tag }}
+        run: |
+          docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }}
+          docker push ${{ steps.meta.outputs.local_tag }}
 
       - name: Export image tag for config update
         id: export
@@ -232,117 +229,41 @@ jobs:
           echo "**Result:** Built successfully" >> $GITHUB_STEP_SUMMARY
           echo "**Pushed:** ${{ needs.prepare.outputs.push }}" >> $GITHUB_STEP_SUMMARY
 
-  # ---------------------------------------------------------------------------
-  # Update Config: update cuda.yml with localhost tags (temporary, for test validation)
-  # After tests pass, push_image_harbor.yml will promote to Harbor and update to final tags
-  # ---------------------------------------------------------------------------
-  update-config:
-    name: Update CI config
-    needs: build
-    runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
-    if: needs.build.result == 'success'
-    steps:
-      - name: Clean workspace
-        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true
-
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-          fetch-depth: 0
-
-      - name: Switch to PR branch
-        if: github.event_name == 'pull_request'
-        run: |
-          PR_REPO_FULL="${{ github.event.pull_request.head.repo.full_name }}"
-          PR_BRANCH="${{ github.head_ref }}"
-          FORK_URL="https://x-access-token:${{ secrets.FORK_PUSH_TOKEN }}@github.com/${PR_REPO_FULL}.git"
-          git remote add pr-head "${FORK_URL}" || true
-          git fetch pr-head "${PR_BRANCH}"
-          git checkout -b "${PR_BRANCH}" "pr-head/${PR_BRANCH}"
-
-      - name: Update cuda.yml with new image tags
-        run: |
-          set -euo pipefail
-          CONFIG_FILE=".github/configs/cuda.yml"
-
-          TRAIN_TAG="${{ needs.build.outputs.train_tag }}"
-          INFERENCE_TAG="${{ needs.build.outputs.inference_tag }}"
-
-          if [ -n "$TRAIN_TAG" ]; then
-            echo "Updating ci_train_image to: $TRAIN_TAG"
-            sed -i "s|^ci_train_image:.*|ci_train_image: ${TRAIN_TAG}|" "$CONFIG_FILE"
-          fi
-
-          if [ -n "$INFERENCE_TAG" ]; then
-            echo "Updating ci_inference_image to: $INFERENCE_TAG"
-            sed -i "s|^ci_inference_image:.*|ci_inference_image: ${INFERENCE_TAG}|" "$CONFIG_FILE"
-          fi
-
-          echo "Updated config:"
-          cat "$CONFIG_FILE"
-
-      - name: Commit and push config update
-        run: |
-          git config user.name "github-actions[bot]"
-          git config user.email "github-actions[bot]@users.noreply.github.com"
-          git add .github/configs/cuda.yml
-          if git diff --cached --quiet; then
-            echo "No config changes to commit"
-          else
-            git commit -m "ci: update CUDA image tags [skip ci]"
-            if [ "${{ github.event_name }}" = "pull_request" ]; then
-              git push pr-head HEAD:"${{ github.head_ref }}"
-            else
-              git push
-            fi
-          fi
-
-  # ---------------------------------------------------------------------------
-  # Token help: print setup instructions when update-config fails on fork PRs
-  # ---------------------------------------------------------------------------
-  token-help:
-    name: Print token setup instructions
-    needs: update-config
-    runs-on: ubuntu-latest
-    if: failure() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.repository
-    steps:
-      - name: Print FORK_PUSH_TOKEN setup instructions
-        run: |
-          echo "::error::Push to fork failed. The FORK_PUSH_TOKEN secret is missing or invalid."
-          {
-            echo "### Push to fork failed — FORK_PUSH_TOKEN setup required"
-            echo ""
-            echo "**Step 1: Create a Personal Access Token (PAT)**"
-            echo "1. Go to your GitHub account: \`Settings → Developer settings → Personal access tokens → Fine-grained tokens\`"
-            echo "2. Click \`Generate new token\`"
-            echo "3. Set \`Repository access\` → \`Only select repositories\` → select your fork repo (\`${{ github.event.pull_request.head.repo.full_name }}\`)"
-            echo "4. Set \`Permissions\` → \`Repository permissions\` → \`Contents\`: **Read and Write**"
-            echo "5. Click \`Generate token\` and copy the token value"
-            echo ""
-            echo "**Step 2: Add the secret to the upstream repository**"
-            echo "1. Go to [\`${{ github.repository }}\`](https://github.com/${{ github.repository }}/settings/secrets/actions) → \`Settings → Secrets and variables → Actions\`"
-            echo "2. Click \`New repository secret\`"
-            echo "3. Name: \`FORK_PUSH_TOKEN\`"
-            echo "4. Value: paste the token from Step 1"
-            echo "5. Click \`Add secret\`"
-            echo ""
-            echo "**Step 3:** Re-run this workflow"
-          } >> $GITHUB_STEP_SUMMARY
-
   # ---------------------------------------------------------------------------
   # Summary: verify all builds completed
   # ---------------------------------------------------------------------------
   summary:
     name: Build summary
-    needs: [update-config, token-help]
+    needs: build
     runs-on: ubuntu-latest
     if: always()
     steps:
       - name: Verify build results
         run: |
-          if [ "${{ needs.update-config.result }}" != "success" ]; then
+          if [ "${{ needs.build.result }}" != "success" ]; then
             echo "::error::One or more image builds failed"
             exit 1
           fi
           echo "All Docker images built successfully!"
+
+  # ---------------------------------------------------------------------------
+  # Run CUDA tests after build succeeds
+  # ---------------------------------------------------------------------------
+  run_cuda_tests:
+    name: Run CUDA tests
+    needs: summary
+    if: needs.summary.result == 'success'
+    uses: ./.github/workflows/all_tests_cuda.yml
+    secrets: inherit
+
+  # ---------------------------------------------------------------------------
+  # Push validated images to Harbor after tests pass
+  # ---------------------------------------------------------------------------
+  push_images_to_harbor:
+    name: Push images to Harbor
+    needs: run_cuda_tests
+    if: needs.run_cuda_tests.result == 'success'
+    uses: ./.github/workflows/push_image_harbor.yml
+    with:
+      platform: cuda
+    secrets: inherit

From a510a24a9be97d6464b69e70a37e5fdea119194d Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Fri, 20 Mar 2026 15:42:58 +0800
Subject: [PATCH 13/29] debug: temporarily disable image build/push and CUDA
 tests to exercise Harbor push

---
 .github/workflows/build_image_cuda.yml  | 68 +++++++++++++------------
 .github/workflows/push_image_harbor.yml |  3 ++
 2 files changed, 38 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 44f3ef3efc..8f4a71759e 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -191,29 +191,29 @@ jobs:
             echo "| Registry Tag | \`${LOCAL_TAG}\` |"
           } >> $GITHUB_STEP_SUMMARY
 
-      - name: Build Docker image
-        uses: docker/build-push-action@v6
-        with:
-          context: .
-          file: docker/cuda/Dockerfile.${{ matrix.task }}
-          target: ${{ needs.prepare.outputs.target }}
-          load: true
-          tags: ${{ steps.meta.outputs.build_tag }}
-          build-args: |
-            BASE_IMAGE=${{ steps.meta.outputs.base_image }}
-            CUDA_VERSION=${{ env.CUDA_VERSION }}
-            UBUNTU_VERSION=${{ env.UBUNTU_VERSION }}
-            PYTHON_VERSION=${{ env.PYTHON_VERSION }}
-            UV_VERSION=${{ env.UV_VERSION }}
-            PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }}
-            PKG_MGR=${{ env.PKG_MGR }}
-          no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}
+      # - name: Build Docker image
+      #   uses: docker/build-push-action@v6
+      #   with:
+      #     context: .
+      #     file: docker/cuda/Dockerfile.${{ matrix.task }}
+      #     target: ${{ needs.prepare.outputs.target }}
+      #     load: true
+      #     tags: ${{ steps.meta.outputs.build_tag }}
+      #     build-args: |
+      #       BASE_IMAGE=${{ steps.meta.outputs.base_image }}
+      #       CUDA_VERSION=${{ env.CUDA_VERSION }}
+      #       UBUNTU_VERSION=${{ env.UBUNTU_VERSION }}
+      #       PYTHON_VERSION=${{ env.PYTHON_VERSION }}
+      #       UV_VERSION=${{ env.UV_VERSION }}
+      #       PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }}
+      #       PKG_MGR=${{ env.PKG_MGR }}
+      #     no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}
 
-      - name: Push Docker image
-        if: needs.prepare.outputs.push == 'true'
-        run: |
-          docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }}
-          docker push ${{ steps.meta.outputs.local_tag }}
+      # - name: Push Docker image
+      #   if: needs.prepare.outputs.push == 'true'
+      #   run: |
+      #     docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }}
+      #     docker push ${{ steps.meta.outputs.local_tag }}
 
       - name: Export image tag for config update
         id: export
@@ -246,23 +246,25 @@ jobs:
           fi
           echo "All Docker images built successfully!"
 
-  # ---------------------------------------------------------------------------
-  # Run CUDA tests after build succeeds
-  # ---------------------------------------------------------------------------
-  run_cuda_tests:
-    name: Run CUDA tests
-    needs: summary
-    if: needs.summary.result == 'success'
-    uses: ./.github/workflows/all_tests_cuda.yml
-    secrets: inherit
+  # # ---------------------------------------------------------------------------
+  # # Run CUDA tests after build succeeds
+  # # ---------------------------------------------------------------------------
+  # run_cuda_tests:
+  #   name: Run CUDA tests
+  #   needs: summary
+  #   if: needs.summary.result == 'success'
+  #   uses: ./.github/workflows/all_tests_cuda.yml
+  #   secrets: inherit
 
   # ---------------------------------------------------------------------------
   # Push validated images to Harbor after tests pass
   # ---------------------------------------------------------------------------
   push_images_to_harbor:
     name: Push images to Harbor
-    needs: run_cuda_tests
-    if: needs.run_cuda_tests.result == 'success'
+    # needs: run_cuda_tests
+    needs: summary
+    # if: needs.run_cuda_tests.result == 'success'
+    if: needs.summary.result == 'success'
     uses: ./.github/workflows/push_image_harbor.yml
     with:
       platform: cuda
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
index d10e9a3e4c..640c59d0f6 100644
--- a/.github/workflows/push_image_harbor.yml
+++ b/.github/workflows/push_image_harbor.yml
@@ -17,6 +17,9 @@ jobs:
     name: Push validated images to Harbor
     runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
     steps:
+      - name: Clean workspace
+        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true
+
       - name: Checkout code
         uses: actions/checkout@v4
 

From 3492db7fcb557acda0fcaff5438d9275c0acfec5 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Sat, 21 Mar 2026 10:41:12 +0800
Subject: [PATCH 14/29] re-enable Docker build/push and CUDA tests in
 build_image_cuda workflow; comment out unit_tests in all_tests_common

---
 .github/workflows/all_tests_common.yml | 67 +++++++++++++------------
 .github/workflows/build_image_cuda.yml | 68 +++++++++++++-------------
 2 files changed, 69 insertions(+), 66 deletions(-)

diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index d898543572..eb766f4111 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -112,32 +112,33 @@ jobs:
       container_options: ${{ needs.checkout_and_config.outputs.container_options }}
       source_artifact: flagscale-source-${{ github.sha }}
 
-  unit_tests:
-    needs:
-      - checkout_and_config
-      - cli_validation
-    strategy:
-      fail-fast: false
-      matrix:
-        device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
-    uses: ./.github/workflows/unit_tests_common.yml
-    name: unit_tests
-    with:
-      platform: ${{ inputs.platform }}
-      device: ${{ matrix.device }}
-      image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
-      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
-      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
-      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
-      source_artifact: flagscale-source-${{ github.sha }}
-      pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
-      env_name: ${{ needs.checkout_and_config.outputs.env_name_train }}
-      env_path: ${{ needs.checkout_and_config.outputs.env_path }}
+  # unit_tests:
+  #   needs:
+  #     - checkout_and_config
+  #     - cli_validation
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
+  #   uses: ./.github/workflows/unit_tests_common.yml
+  #   name: unit_tests
+  #   with:
+  #     platform: ${{ inputs.platform }}
+  #     device: ${{ matrix.device }}
+  #     image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
+  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+  #     source_artifact: flagscale-source-${{ github.sha }}
+  #     pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
+  #     env_name: ${{ needs.checkout_and_config.outputs.env_name_train }}
+  #     env_path: ${{ needs.checkout_and_config.outputs.env_path }}
 
   functional_tests_train:
     needs:
       - checkout_and_config
-      - unit_tests
+      - cli_validation
+      # - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_train.yml
     with:
@@ -155,7 +156,8 @@ jobs:
   functional_tests_hetero_train:
     needs:
       - checkout_and_config
-      - unit_tests
+      - cli_validation
+      # - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.hetero_train_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_hetero_train.yml
     with:
@@ -173,7 +175,8 @@ jobs:
   functional_tests_inference:
     needs:
       - checkout_and_config
-      - unit_tests
+      - cli_validation
+      # - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_inference.yml
     with:
@@ -191,7 +194,8 @@ jobs:
   functional_tests_serve:
     needs:
       - checkout_and_config
-      - unit_tests
+      - cli_validation
+      # - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_serve.yml
     with:
@@ -228,7 +232,8 @@ jobs:
   functional_tests_benchmark:
     needs:
       - checkout_and_config
-      - unit_tests
+      - cli_validation
+      # - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.benchmark_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_benchmark.yml
     with:
@@ -250,7 +255,7 @@ jobs:
     needs:
       - checkout_and_config
       - cli_validation
-      - unit_tests
+      # - unit_tests
       - functional_tests_train
       - functional_tests_hetero_train
       - functional_tests_benchmark
@@ -266,10 +271,10 @@ jobs:
           # Check all test jobs (skip if not run)
           failed=false
 
-          if [ "${{ needs.unit_tests.result }}" != "success" ]; then
-            echo "❌ Unit tests failed"
-            failed=true
-          fi
+          # if [ "${{ needs.unit_tests.result }}" != "success" ]; then
+          #   echo "❌ Unit tests failed"
+          #   failed=true
+          # fi
 
           if [ "${{ needs.cli_validation.result }}" != "success" ]; then
             echo "❌ CLI validation failed"
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 8f4a71759e..44f3ef3efc 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -191,29 +191,29 @@ jobs:
             echo "| Registry Tag | \`${LOCAL_TAG}\` |"
           } >> $GITHUB_STEP_SUMMARY
 
-      # - name: Build Docker image
-      #   uses: docker/build-push-action@v6
-      #   with:
-      #     context: .
-      #     file: docker/cuda/Dockerfile.${{ matrix.task }}
-      #     target: ${{ needs.prepare.outputs.target }}
-      #     load: true
-      #     tags: ${{ steps.meta.outputs.build_tag }}
-      #     build-args: |
-      #       BASE_IMAGE=${{ steps.meta.outputs.base_image }}
-      #       CUDA_VERSION=${{ env.CUDA_VERSION }}
-      #       UBUNTU_VERSION=${{ env.UBUNTU_VERSION }}
-      #       PYTHON_VERSION=${{ env.PYTHON_VERSION }}
-      #       UV_VERSION=${{ env.UV_VERSION }}
-      #       PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }}
-      #       PKG_MGR=${{ env.PKG_MGR }}
-      #     no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}
+      - name: Build Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: docker/cuda/Dockerfile.${{ matrix.task }}
+          target: ${{ needs.prepare.outputs.target }}
+          load: true
+          tags: ${{ steps.meta.outputs.build_tag }}
+          build-args: |
+            BASE_IMAGE=${{ steps.meta.outputs.base_image }}
+            CUDA_VERSION=${{ env.CUDA_VERSION }}
+            UBUNTU_VERSION=${{ env.UBUNTU_VERSION }}
+            PYTHON_VERSION=${{ env.PYTHON_VERSION }}
+            UV_VERSION=${{ env.UV_VERSION }}
+            PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }}
+            PKG_MGR=${{ env.PKG_MGR }}
+          no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}
 
-      # - name: Push Docker image
-      #   if: needs.prepare.outputs.push == 'true'
-      #   run: |
-      #     docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }}
-      #     docker push ${{ steps.meta.outputs.local_tag }}
+      - name: Push Docker image
+        if: needs.prepare.outputs.push == 'true'
+        run: |
+          docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }}
+          docker push ${{ steps.meta.outputs.local_tag }}
 
       - name: Export image tag for config update
         id: export
@@ -246,25 +246,23 @@ jobs:
           fi
           echo "All Docker images built successfully!"
 
-  # # ---------------------------------------------------------------------------
-  # # Run CUDA tests after build succeeds
-  # # ---------------------------------------------------------------------------
-  # run_cuda_tests:
-  #   name: Run CUDA tests
-  #   needs: summary
-  #   if: needs.summary.result == 'success'
-  #   uses: ./.github/workflows/all_tests_cuda.yml
-  #   secrets: inherit
+  # ---------------------------------------------------------------------------
+  # Run CUDA tests after build succeeds
+  # ---------------------------------------------------------------------------
+  run_cuda_tests:
+    name: Run CUDA tests
+    needs: summary
+    if: needs.summary.result == 'success'
+    uses: ./.github/workflows/all_tests_cuda.yml
+    secrets: inherit
 
   # ---------------------------------------------------------------------------
   # Push validated images to Harbor after tests pass
   # ---------------------------------------------------------------------------
   push_images_to_harbor:
     name: Push images to Harbor
-    # needs: run_cuda_tests
-    needs: summary
-    # if: needs.run_cuda_tests.result == 'success'
-    if: needs.summary.result == 'success'
+    needs: run_cuda_tests
+    if: needs.run_cuda_tests.result == 'success'
     uses: ./.github/workflows/push_image_harbor.yml
     with:
       platform: cuda

From 03ee149b6ed918c5ce0a3a2d10d900027991b405 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Mon, 30 Mar 2026 17:14:05 +0800
Subject: [PATCH 15/29] add two-stage pipeline to support fork PR builds

---
 .github/workflows/build_image_cuda.yml | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 44f3ef3efc..b10dc6da8e 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -42,7 +42,7 @@ on:
         type: boolean
         default: false
 
-  # Trigger on PRs that modify docker-related files
+  # Trigger on PRs that modify docker-related files (build + test only, no push)
   pull_request:
     branches: [main]
     paths:
@@ -52,6 +52,16 @@ on:
       - 'requirements/**'
       - '.github/workflows/build_image_cuda.yml'
 
+  # Trigger on merge to main: full pipeline including push to Harbor
+  push:
+    branches: [main]
+    paths:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
+
 permissions:
   contents: read
 
@@ -252,7 +262,7 @@ jobs:
   run_cuda_tests:
     name: Run CUDA tests
     needs: summary
-    if: needs.summary.result == 'success'
+    if: needs.summary.result == 'success' && github.event_name != 'pull_request'
     uses: ./.github/workflows/all_tests_cuda.yml
     secrets: inherit
 
@@ -262,7 +272,7 @@ jobs:
   push_images_to_harbor:
     name: Push images to Harbor
     needs: run_cuda_tests
-    if: needs.run_cuda_tests.result == 'success'
+    if: needs.run_cuda_tests.result == 'success' && github.event_name != 'pull_request'
     uses: ./.github/workflows/push_image_harbor.yml
     with:
       platform: cuda

From bc952890b853fb914b4cbd109cd9c22acacb987d Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 31 Mar 2026 11:34:42 +0800
Subject: [PATCH 16/29] ci: run CUDA tests on pull_request builds again

---
 .github/workflows/build_image_cuda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index b10dc6da8e..63d1297618 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -262,7 +262,7 @@ jobs:
   run_cuda_tests:
     name: Run CUDA tests
     needs: summary
-    if: needs.summary.result == 'success' && github.event_name != 'pull_request'
+    if: needs.summary.result == 'success'
     uses: ./.github/workflows/all_tests_cuda.yml
     secrets: inherit
 

From 02ffa3134f0a2af4d1e967c15b750729af18dc94 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Wed, 8 Apr 2026 19:07:00 +0800
Subject: [PATCH 17/29] ci: refactor build_image_cuda workflow

---
 .github/configs/cuda.yml                |   4 +-
 .github/workflows/all_tests_common.yml  |  18 +++
 .github/workflows/all_tests_cuda.yml    |   5 +-
 .github/workflows/build_image_cuda.yml  |  99 +++++++++-------
 .github/workflows/push_image_harbor.yml | 150 ++++++++++++++++++------
 5 files changed, 192 insertions(+), 84 deletions(-)

diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index cd1d99461f..093b97dac2 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -7,8 +7,8 @@ display_name: "CUDA Tests"
 
 # Docker image for this hardware
 ci_image: localhost:5000/flagscale:cuda12.8.1-cudnn9.7.1-python3.12-torch2.7.1-time2510131515
-ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12
-ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12
+ci_train_image: localhost:5000/flagscale-train:dev-cu128-py3.12-20260228210721
+ci_inference_image: localhost:5000/flagscale-inference:dev-cu128-py3.12-20260302102033
 
 # Runner labels for this hardware
 runner_labels:
diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index eb766f4111..47af712e49 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -7,6 +7,16 @@ on:
         required: true
         type: string
         description: Platform name (e.g., cuda, default)
+      ci_train_image:
+        required: false
+        type: string
+        description: Override train image (e.g., newly built image). Falls back to platform config if not set.
+        default: ""
+      ci_inference_image:
+        required: false
+        type: string
+        description: Override inference image. Falls back to platform config if not set.
+        default: ""
 
 jobs:
   checkout_and_config:
@@ -100,6 +110,14 @@ jobs:
           # Load configuration and group tests by task
           load_platform_config "$PLATFORM"
 
+          # Override images if provided as inputs
+          if [ -n "${{ inputs.ci_train_image }}" ]; then
+            echo "ci_train_image=${{ inputs.ci_train_image }}" >> $GITHUB_OUTPUT
+          fi
+          if [ -n "${{ inputs.ci_inference_image }}" ]; then
+            echo "ci_inference_image=${{ inputs.ci_inference_image }}" >> $GITHUB_OUTPUT
+          fi
+
   # CLI validation runs first (outside virtual env) as a gate for all subsequent tests
   cli_validation:
     needs: checkout_and_config
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index e02ef07b77..2374e10850 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -1,9 +1,6 @@
 name: cuda_tests
 
 on:
-  # Called by Build Docker Images - CUDA workflow
-  workflow_call:
-
   push:
     branches: ["main"]
     paths-ignore:
@@ -12,6 +9,7 @@ on:
       - 'tools/install/**'
       - 'requirements/**'
       - '.github/workflows/build_image_cuda.yml'
+      - '.github/workflows/push_image_harbor.yml'
   pull_request:
     branches: ["main"]
     paths-ignore:
@@ -20,6 +18,7 @@ on:
       - 'tools/install/**'
       - 'requirements/**'
       - '.github/workflows/build_image_cuda.yml'
+      - '.github/workflows/push_image_harbor.yml'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 63d1297618..ae96330ada 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -41,6 +41,10 @@ on:
         description: 'Build all tasks (train, inference, all) - overrides task selection'
         type: boolean
         default: false
+      tar_dir:
+        description: 'Directory to store image tar files'
+        type: string
+        default: '/home/flagscale_cicd/images_tar'
 
   # Trigger on PRs that modify docker-related files (build + test only, no push)
   pull_request:
@@ -52,16 +56,6 @@ on:
       - 'requirements/**'
       - '.github/workflows/build_image_cuda.yml'
 
-  # Trigger on merge to main: full pipeline including push to Harbor
-  push:
-    branches: [main]
-    paths:
-      - 'docker/cuda/**'
-      - 'docker/build.sh'
-      - 'tools/install/**'
-      - 'requirements/**'
-      - '.github/workflows/build_image_cuda.yml'
-
 permissions:
   contents: read
 
@@ -78,6 +72,7 @@ env:
   PYTHON_VERSION: '3.12'
   UV_VERSION: '0.7.2'
   PKG_MGR: ${{ inputs.pkg_mgr || 'conda' }}
+  TAR_DIR: ${{ inputs.tar_dir || '/home/flagscale_cicd/images_tar' }}
 
 jobs:
   # ---------------------------------------------------------------------------
@@ -102,7 +97,8 @@ jobs:
             echo 'matrix={"task":["${{ inputs.task }}"]}' >> $GITHUB_OUTPUT
           else
             # PR or build_all_tasks=true: build all tasks
-            echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT
+            # echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT
+            echo 'matrix={"task":["train"]}' >> $GITHUB_OUTPUT
           fi
 
       - name: Set build parameters
@@ -136,6 +132,9 @@ jobs:
       train_tag: ${{ steps.export.outputs.train_tag }}
       inference_tag: ${{ steps.export.outputs.inference_tag }}
       all_tag: ${{ steps.export.outputs.all_tag }}
+      train_tar: ${{ steps.export.outputs.train_tar }}
+      inference_tar: ${{ steps.export.outputs.inference_tar }}
+      all_tar: ${{ steps.export.outputs.all_tar }}
     steps:
       - name: Clean workspace
         run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true
@@ -167,22 +166,16 @@ jobs:
           # Image naming follows docker/build.sh convention:
           #   flagscale-<task>:<target>-cu<major><minor>-py<version>[-<timestamp>]
           IMAGE_NAME="flagscale-${TASK}"
-          TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}"
-          TAG_TS="${TAG}-${TIMESTAMP}"
+          TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}-${TIMESTAMP}"
 
-          # Build tag (with timestamp, used during docker build)
-          BUILD_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG_TS}"
-          # Registry tag (without timestamp, used when pushing to registry)
-          LOCAL_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG}"
+          BUILD_TAG="${{ env.REGISTRY }}/${IMAGE_NAME}:${TAG}"
 
           # Derived build arguments
           BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
           PYTORCH_INDEX="https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}"
 
           echo "image_name=${IMAGE_NAME}" >> $GITHUB_OUTPUT
-          echo "tag=${TAG}" >> $GITHUB_OUTPUT
           echo "build_tag=${BUILD_TAG}" >> $GITHUB_OUTPUT
-          echo "local_tag=${LOCAL_TAG}" >> $GITHUB_OUTPUT
           echo "base_image=${BASE_IMAGE}" >> $GITHUB_OUTPUT
           echo "pytorch_index=${PYTORCH_INDEX}" >> $GITHUB_OUTPUT
 
@@ -198,7 +191,6 @@ jobs:
             echo "| Python | \`${PYTHON_VERSION}\` |"
             echo "| Dockerfile | \`docker/cuda/Dockerfile.${TASK}\` |"
             echo "| Build Tag | \`${BUILD_TAG}\` |"
-            echo "| Registry Tag | \`${LOCAL_TAG}\` |"
           } >> $GITHUB_STEP_SUMMARY
 
       - name: Build Docker image
@@ -219,18 +211,23 @@ jobs:
             PKG_MGR=${{ env.PKG_MGR }}
           no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}
 
-      - name: Push Docker image
-        if: needs.prepare.outputs.push == 'true'
+      - name: Save image as tar
         run: |
-          docker tag ${{ steps.meta.outputs.build_tag }} ${{ steps.meta.outputs.local_tag }}
-          docker push ${{ steps.meta.outputs.local_tag }}
+          TAR_DIR="${{ env.TAR_DIR }}"
+          mkdir -p "$TAR_DIR"
+          IMAGE_TAG="${{ steps.meta.outputs.build_tag }}"
+          # Use image name with tag as filename (replace / and : with -)
+          TAR_NAME=$(echo "$IMAGE_TAG" | tr '/: ' '---').tar
+          docker save "$IMAGE_TAG" -o "${TAR_DIR}/${TAR_NAME}"
+          echo "tar_path=${TAR_DIR}/${TAR_NAME}" >> $GITHUB_OUTPUT
+        id: save_tar
 
       - name: Export image tag for config update
         id: export
-        if: success() && needs.prepare.outputs.push == 'true'
         run: |
           TASK="${{ matrix.task }}"
-          echo "${TASK}_tag=${{ steps.meta.outputs.local_tag }}" >> $GITHUB_OUTPUT
+          echo "${TASK}_tag=${{ steps.meta.outputs.build_tag }}" >> $GITHUB_OUTPUT
+          echo "${TASK}_tar=${{ steps.save_tar.outputs.tar_path }}" >> $GITHUB_OUTPUT
 
       - name: Print build result
         if: success()
@@ -257,23 +254,45 @@ jobs:
           echo "All Docker images built successfully!"
 
   # ---------------------------------------------------------------------------
-  # Run CUDA tests after build succeeds
+  # Load images from tar and push to local registry before running tests
   # ---------------------------------------------------------------------------
-  run_cuda_tests:
-    name: Run CUDA tests
-    needs: summary
-    if: needs.summary.result == 'success'
-    uses: ./.github/workflows/all_tests_cuda.yml
-    secrets: inherit
+  load_images:
+    name: Load and push images
+    needs: ['build', 'summary']
+    runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
+    steps:
+      - name: Load train image from tar and push
+        run: |
+          TAR="${{ needs.build.outputs.train_tar }}"
+          TAG="${{ needs.build.outputs.train_tag }}"
+          if [ -f "$TAR" ]; then
+            echo "Loading $TAR"
+            docker load -i "$TAR"
+            docker push "$TAG"
+          else
+            echo "::warning::Train image tar not found: $TAR, skipping load"
+          fi
+
+      - name: Load inference image from tar and push
+        run: |
+          TAR="${{ needs.build.outputs.inference_tar }}"
+          TAG="${{ needs.build.outputs.inference_tag }}"
+          if [ -f "$TAR" ]; then
+            echo "Loading $TAR"
+            docker load -i "$TAR"
+            docker push "$TAG"
+          else
+            echo "::warning::Inference image tar not found: $TAR, skipping load"
+          fi
 
   # ---------------------------------------------------------------------------
-  # Push validated images to Harbor after tests pass
+  # Run CUDA tests after build succeeds
   # ---------------------------------------------------------------------------
-  push_images_to_harbor:
-    name: Push images to Harbor
-    needs: run_cuda_tests
-    if: needs.run_cuda_tests.result == 'success' && github.event_name != 'pull_request'
-    uses: ./.github/workflows/push_image_harbor.yml
+  run_cuda_tests:
+    name: Run CUDA tests
+    needs: ['prepare', 'build', 'load_images']
+    uses: ./.github/workflows/all_tests_common.yml
     with:
       platform: cuda
-    secrets: inherit
+      ci_train_image: ${{ needs.build.outputs.train_tag }}
+      ci_inference_image: ${{ needs.build.outputs.inference_tag }}
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
index 640c59d0f6..708d4040c2 100644
--- a/.github/workflows/push_image_harbor.yml
+++ b/.github/workflows/push_image_harbor.yml
@@ -1,51 +1,70 @@
 name: Push Images to Harbor
 
 on:
-  workflow_call:
-    inputs:
-      platform:
-        required: true
-        type: string
-        description: "Platform name (e.g. cuda), used to locate .github/configs/<platform>.yml"
+  push:
+    branches: ['main']
+    paths:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
+
+permissions:
+  contents: write
 
 env:
   REMOTE_REGISTRY: harbor.baai.ac.cn
   REMOTE_IMAGE_PREFIX: flagscale
+  TAR_DIR: /home/flagscale_cicd/images_tar
 
 jobs:
-  promote:
-    name: Push validated images to Harbor
+  # ---------------------------------------------------------------------------
+  # Prepare: scan tar directory and detect which images need promotion
+  # ---------------------------------------------------------------------------
+  prepare:
+    name: Detect tar files
     runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
+    outputs:
+      needs_promotion: ${{ steps.detect.outputs.needs_promotion }}
+      train_tar: ${{ steps.detect.outputs.train_tar }}
+      inference_tar: ${{ steps.detect.outputs.inference_tar }}
     steps:
-      - name: Clean workspace
-        run: sudo rm -rf "$GITHUB_WORKSPACE"/* "$GITHUB_WORKSPACE"/.[!.]* 2>/dev/null || true
-
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Read image tags from config
-        id: tags
+      - name: Detect tar files in ${{ env.TAR_DIR }}
+        id: detect
         run: |
           set -euo pipefail
-          CONFIG_FILE=".github/configs/${{ inputs.platform }}.yml"
+          TAR_DIR="${{ env.TAR_DIR }}"
 
-          TRAIN_TAG=$(grep '^ci_train_image:' "$CONFIG_FILE" | awk '{print $2}')
-          INFERENCE_TAG=$(grep '^ci_inference_image:' "$CONFIG_FILE" | awk '{print $2}')
+          TRAIN_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-train*' -name '*.tar' 2>/dev/null | sort | tail -1)
+          INFERENCE_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-inference*' -name '*.tar' 2>/dev/null | sort | tail -1)
 
-          echo "train_tag=${TRAIN_TAG}" >> $GITHUB_OUTPUT
-          echo "inference_tag=${INFERENCE_TAG}" >> $GITHUB_OUTPUT
+          echo "train_tar=${TRAIN_TAR}" >> $GITHUB_OUTPUT
+          echo "inference_tar=${INFERENCE_TAR}" >> $GITHUB_OUTPUT
 
-          # Check if images are from localhost (freshly built, need promotion)
-          if echo "${TRAIN_TAG}${INFERENCE_TAG}" | grep -q 'localhost'; then
+          if [ -n "$TRAIN_TAR" ] || [ -n "$INFERENCE_TAR" ]; then
             echo "needs_promotion=true" >> $GITHUB_OUTPUT
-            echo "Images are from localhost, promotion needed"
+            echo "Detected tars:"
+            if [ -n "$TRAIN_TAR" ];     then echo "  train:     $TRAIN_TAR"; fi
+            if [ -n "$INFERENCE_TAR" ]; then echo "  inference: $INFERENCE_TAR"; fi
           else
             echo "needs_promotion=false" >> $GITHUB_OUTPUT
-            echo "Images already on Harbor, skipping promotion"
+            echo "No tar files found in $TAR_DIR, skipping promotion"
           fi
 
+  # ---------------------------------------------------------------------------
+  # Promote: load tar → retag → push to Harbor → delete tar
+  # ---------------------------------------------------------------------------
+  promote:
+    name: Push validated images to Harbor
+    needs: prepare
+    if: needs.prepare.outputs.needs_promotion == 'true'
+    runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
+    outputs:
+      remote_train_tag: ${{ steps.promote_train.outputs.remote_tag }}
+      remote_inference_tag: ${{ steps.promote_inference.outputs.remote_tag }}
+    steps:
       - name: Login to Harbor registry
-        if: steps.tags.outputs.needs_promotion == 'true'
         uses: docker/login-action@v3
         with:
           registry: ${{ env.REMOTE_REGISTRY }}
@@ -54,37 +73,91 @@ jobs:
 
       - name: Promote train image to Harbor
         id: promote_train
-        if: steps.tags.outputs.needs_promotion == 'true' && steps.tags.outputs.train_tag != ''
+        if: needs.prepare.outputs.train_tar != ''
         run: |
           set -euo pipefail
-          LOCAL_TAG="${{ steps.tags.outputs.train_tag }}"
+          TAR_PATH="${{ needs.prepare.outputs.train_tar }}"
+
+          echo "Loading tar: $TAR_PATH"
+          LOCAL_TAG=$(docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}')
+          echo "Loaded image tag: $LOCAL_TAG"
+
+          # Strip local registry prefix (e.g. localhost:5000/) to get image:tag
           IMAGE_AND_TAG="${LOCAL_TAG#*/}"
           REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}"
 
-          echo "Promoting: ${LOCAL_TAG} → ${REMOTE_TAG}"
-          docker pull "${LOCAL_TAG}"
           docker tag "${LOCAL_TAG}" "${REMOTE_TAG}"
           docker push "${REMOTE_TAG}"
-
           echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT
-          echo "Successfully pushed ${REMOTE_TAG}"
+          echo "Pushed: ${REMOTE_TAG}"
+
+          echo "Removing tar: $TAR_PATH"
+          rm -f "$TAR_PATH"
 
       - name: Promote inference image to Harbor
         id: promote_inference
-        if: steps.tags.outputs.needs_promotion == 'true' && steps.tags.outputs.inference_tag != ''
+        if: needs.prepare.outputs.inference_tar != ''
         run: |
           set -euo pipefail
-          LOCAL_TAG="${{ steps.tags.outputs.inference_tag }}"
+          TAR_PATH="${{ needs.prepare.outputs.inference_tar }}"
+
+          echo "Loading tar: $TAR_PATH"
+          LOCAL_TAG=$(docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}')
+          echo "Loaded image tag: $LOCAL_TAG"
+
           IMAGE_AND_TAG="${LOCAL_TAG#*/}"
           REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}"
 
-          echo "Promoting: ${LOCAL_TAG} → ${REMOTE_TAG}"
-          docker pull "${LOCAL_TAG}"
           docker tag "${LOCAL_TAG}" "${REMOTE_TAG}"
           docker push "${REMOTE_TAG}"
-
           echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT
-          echo "Successfully pushed ${REMOTE_TAG}"
+          echo "Pushed: ${REMOTE_TAG}"
+
+          echo "Removing tar: $TAR_PATH"
+          rm -f "$TAR_PATH"
+
+  # ---------------------------------------------------------------------------
+  # Update config: write Harbor image tags back to cuda.yml and commit
+  # ---------------------------------------------------------------------------
+  update_config:
+    name: Update cuda.yml with Harbor image tags
+    needs: promote
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Update image tags in cuda.yml
+        run: |
+          set -euo pipefail
+          CONFIG_FILE=".github/configs/cuda.yml"
+
+          REMOTE_TRAIN="${{ needs.promote.outputs.remote_train_tag }}"
+          REMOTE_INFERENCE="${{ needs.promote.outputs.remote_inference_tag }}"
+
+          if [ -n "$REMOTE_TRAIN" ]; then
+            sed -i "s|^ci_train_image:.*|ci_train_image: ${REMOTE_TRAIN}|" "$CONFIG_FILE"
+            echo "Updated ci_train_image: ${REMOTE_TRAIN}"
+          fi
+
+          if [ -n "$REMOTE_INFERENCE" ]; then
+            sed -i "s|^ci_inference_image:.*|ci_inference_image: ${REMOTE_INFERENCE}|" "$CONFIG_FILE"
+            echo "Updated ci_inference_image: ${REMOTE_INFERENCE}"
+          fi
+
+      - name: Commit and push updated config
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add .github/configs/cuda.yml
+          if git diff --cached --quiet; then
+            echo "No changes to commit"
+          else
+            git commit -m "ci: update cuda.yml with new Harbor image tags [skip ci]"
+            git push
+          fi
 
   # ---------------------------------------------------------------------------
   # Cleanup: clean up Docker build cache and dangling images on self-hosted runner
@@ -103,7 +176,6 @@ jobs:
 
       - name: Remove old localhost registry images
         run: |
-          # Remove local images tagged with localhost:5000 that are older than 7 days
           docker images --format '{{.Repository}}:{{.Tag}} {{.CreatedSince}}' \
             | grep 'localhost:5000' \
             | grep -E '(weeks|months)' \

From 56c30b10ae91ecd81cfab41752fe4e87bb81a86a Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Thu, 9 Apr 2026 10:07:26 +0800
Subject: [PATCH 18/29] ci: trigger push_image_harbor on build workflow success

---
 .github/workflows/build_image_cuda.yml  |  3 +--
 .github/workflows/push_image_harbor.yml | 12 ++++--------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index ae96330ada..49e80d299e 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -97,8 +97,7 @@ jobs:
             echo 'matrix={"task":["${{ inputs.task }}"]}' >> $GITHUB_OUTPUT
           else
             # PR or build_all_tasks=true: build all tasks
-            # echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT
-            echo 'matrix={"task":["train"]}' >> $GITHUB_OUTPUT
+            echo 'matrix={"task":["train","inference","all"]}' >> $GITHUB_OUTPUT
           fi
 
       - name: Set build parameters
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
index 708d4040c2..1c2f5fb59c 100644
--- a/.github/workflows/push_image_harbor.yml
+++ b/.github/workflows/push_image_harbor.yml
@@ -1,14 +1,9 @@
 name: Push Images to Harbor
 
 on:
-  push:
-    branches: ['main']
-    paths:
-      - 'docker/cuda/**'
-      - 'docker/build.sh'
-      - 'tools/install/**'
-      - 'requirements/**'
-      - '.github/workflows/build_image_cuda.yml'
+  workflow_run:
+    workflows: ['Build Docker Images - CUDA']
+    types: [completed]
 
 permissions:
   contents: write
@@ -25,6 +20,7 @@ jobs:
   prepare:
     name: Detect tar files
     runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
+    if: github.event.workflow_run.conclusion == 'success'
     outputs:
       needs_promotion: ${{ steps.detect.outputs.needs_promotion }}
       train_tar: ${{ steps.detect.outputs.train_tar }}

From 63ae08340f86d2f1b595ae6f3e587e05ef8324a9 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Thu, 9 Apr 2026 11:32:59 +0800
Subject: [PATCH 19/29] ci: use sudo for image tar mkdir, docker save and tar removal

---
 .github/workflows/build_image_cuda.yml  | 4 ++--
 .github/workflows/push_image_harbor.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 49e80d299e..9c34c39b96 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -213,11 +213,11 @@ jobs:
       - name: Save image as tar
         run: |
           TAR_DIR="${{ env.TAR_DIR }}"
-          mkdir -p "$TAR_DIR"
+          sudo mkdir -p "$TAR_DIR"
           IMAGE_TAG="${{ steps.meta.outputs.build_tag }}"
           # Use image name with tag as filename (replace / and : with -)
           TAR_NAME=$(echo "$IMAGE_TAG" | tr '/: ' '---').tar
-          docker save "$IMAGE_TAG" -o "${TAR_DIR}/${TAR_NAME}"
+          sudo docker save "$IMAGE_TAG" -o "${TAR_DIR}/${TAR_NAME}"
           echo "tar_path=${TAR_DIR}/${TAR_NAME}" >> $GITHUB_OUTPUT
         id: save_tar
 
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
index 1c2f5fb59c..c06e4a3f5f 100644
--- a/.github/workflows/push_image_harbor.yml
+++ b/.github/workflows/push_image_harbor.yml
@@ -88,7 +88,7 @@ jobs:
           echo "Pushed: ${REMOTE_TAG}"
 
           echo "Removing tar: $TAR_PATH"
-          rm -f "$TAR_PATH"
+          sudo rm -f "$TAR_PATH"
 
       - name: Promote inference image to Harbor
         id: promote_inference

From cd02656e9b5c7305b27fe0aac7f9ccda967b81c6 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Thu, 9 Apr 2026 18:42:03 +0800
Subject: [PATCH 20/29] ci: add paths-ignore for ascend/metax, fix sudo docker
 load, support all-image promotion

---
 .github/workflows/all_tests_ascend.yml       | 14 +++++
 .github/workflows/all_tests_metax.yml        | 14 +++++
 .github/workflows/build_image_cuda.yml       |  4 +-
 .github/workflows/functional_tests_train.yml | 66 --------------------
 .github/workflows/push_image_harbor.yml      | 51 ++++++++++++---
 5 files changed, 74 insertions(+), 75 deletions(-)

diff --git a/.github/workflows/all_tests_ascend.yml b/.github/workflows/all_tests_ascend.yml
index ae7709de76..459c8616a9 100644
--- a/.github/workflows/all_tests_ascend.yml
+++ b/.github/workflows/all_tests_ascend.yml
@@ -3,8 +3,22 @@ name: ascend_tests
 on:
   push:
     branches: ["main"]
+    paths-ignore:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
+      - '.github/workflows/push_image_harbor.yml'
   pull_request:
     branches: ["main"]
+    paths-ignore:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
+      - '.github/workflows/push_image_harbor.yml'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml
index e2933cddc8..b350a6c4e0 100644
--- a/.github/workflows/all_tests_metax.yml
+++ b/.github/workflows/all_tests_metax.yml
@@ -3,8 +3,22 @@ name: metax_c500_tests
 on:
   push:
     branches: ["main"]
+    paths-ignore:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
+      - '.github/workflows/push_image_harbor.yml'
   pull_request:
     branches: ["main"]
+    paths-ignore:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
+      - '.github/workflows/push_image_harbor.yml'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 9c34c39b96..ec1b92abd2 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -266,7 +266,7 @@ jobs:
           TAG="${{ needs.build.outputs.train_tag }}"
           if [ -f "$TAR" ]; then
             echo "Loading $TAR"
-            docker load -i "$TAR"
+            sudo docker load -i "$TAR"
             docker push "$TAG"
           else
             echo "::warning::Train image tar not found: $TAR, skipping load"
@@ -278,7 +278,7 @@ jobs:
           TAG="${{ needs.build.outputs.inference_tag }}"
           if [ -f "$TAR" ]; then
             echo "Loading $TAR"
-            docker load -i "$TAR"
+            sudo docker load -i "$TAR"
             docker push "$TAG"
           else
             echo "::warning::Inference image tar not found: $TAR, skipping load"
diff --git a/.github/workflows/functional_tests_train.yml b/.github/workflows/functional_tests_train.yml
index b815bba9fe..d3dd892884 100644
--- a/.github/workflows/functional_tests_train.yml
+++ b/.github/workflows/functional_tests_train.yml
@@ -112,7 +112,6 @@ jobs:
           git config --global --add safe.directory $PROJECT_ROOT
 
       - name: Install dependencies for training
-        if: inputs.platform == 'cuda'
         run: |
           set -euo pipefail
           cd $PROJECT_ROOT
@@ -177,71 +176,6 @@ jobs:
           echo "Environment ready for train tests"
         timeout-minutes: 30
 
-      - name: Install dependencies for serve metax
-        if: inputs.platform == 'metax'
-        run: |
-          set -euo pipefail
-
-          # Clone FlagOS's Megatron-LM fork required by the serve task on MetaX platform
-          git clone https://github.com/flagos-ai/Megatron-LM-FL.git /workspace/Megatron-LM-FL
-
-          cd $PROJECT_ROOT
-
-          PKG_MGR='${{ inputs.pkg_mgr }}'
-          ENV_NAME='${{ inputs.env_name }}'
-          ENV_PATH='${{ inputs.env_path }}'
-
-          echo "Installing dependencies for training"
-          echo "Package Manager: $PKG_MGR"
-          echo "Environment Name: $ENV_NAME"
-          echo "Environment Path: $ENV_PATH"
-
-          # Source environment utilities
-          source ./tools/install/utils/pyenv_utils.sh
-
-          # Activate environment based on package manager
-          case "$PKG_MGR" in
-            conda)
-              if [ -n "$ENV_NAME" ]; then
-                activate_conda "$ENV_NAME" "$ENV_PATH" || echo "⚠️  Conda activation failed"
-              fi
-              ;;
-            uv)
-              if [ -n "$ENV_PATH" ] && [ -d "$ENV_PATH" ]; then
-                activate_uv_env "$ENV_PATH" || echo "⚠️  UV activation failed"
-              fi
-              ;;
-            pip)
-              echo "ℹ️  Running tests with pip/system Python"
-              ;;
-          esac
-
-          # Display Python environment info
-          echo "Python location: $(which python)"
-          echo "Python version: $(python --version)"
-
-          # Install Megatron-LM-FL
-          pip install megatron_core==0.1.0+megatron0.15.0rc7 \
-            --extra-index-url https://resource.flagos.net/repository/flagos-pypi-hosted/simple \
-            || { echo "❌ Megatron-LM-FL install failed"; exit 1; }
-          echo "✅ Megatron-LM-FL installed successfully"
-
-          # Install TransformerEngine-FL and dependencies
-          git clone --depth 1 https://github.com/flagos-ai/TransformerEngine-FL.git /workspace/TransformerEngine-FL \
-            || { echo "❌ TransformerEngine-FL clone failed"; exit 1; }
-          TE_FL_SKIP_CUDA=1 pip install /workspace/TransformerEngine-FL --no-build-isolation \
-            || { echo "❌ TransformerEngine-FL install failed"; exit 1; }
-          echo "✅ TransformerEngine-FL installed successfully"
-
-          # Install FlagScale
-          pip install . --no-build-isolation --root-user-action=ignore || { echo "❌ FlagScale CLI install failed"; exit 1; }
-
-          # Verify installation
-          command -v flagscale || { echo "❌ FlagScale CLI not found in PATH"; exit 1; }
-          echo "✅ FlagScale CLI installed successfully: $(flagscale --version 2>/dev/null || echo 'version unknown')"
-          echo "✅ Environment ready for train tests"
-        timeout-minutes: 30
-
       - name: Run functional tests
         id: functional_test
         run: |
diff --git a/.github/workflows/push_image_harbor.yml b/.github/workflows/push_image_harbor.yml
index c06e4a3f5f..7f92fb55ce 100644
--- a/.github/workflows/push_image_harbor.yml
+++ b/.github/workflows/push_image_harbor.yml
@@ -1,9 +1,14 @@
 name: Push Images to Harbor
 
 on:
-  workflow_run:
-    workflows: ['Build Docker Images - CUDA']
-    types: [completed]
+  push:
+    branches: [main]
+    paths:
+      - 'docker/cuda/**'
+      - 'docker/build.sh'
+      - 'tools/install/**'
+      - 'requirements/**'
+      - '.github/workflows/build_image_cuda.yml'
 
 permissions:
   contents: write
@@ -20,11 +25,11 @@ jobs:
   prepare:
     name: Detect tar files
     runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
-    if: github.event.workflow_run.conclusion == 'success'
     outputs:
       needs_promotion: ${{ steps.detect.outputs.needs_promotion }}
       train_tar: ${{ steps.detect.outputs.train_tar }}
       inference_tar: ${{ steps.detect.outputs.inference_tar }}
+      all_tar: ${{ steps.detect.outputs.all_tar }}
     steps:
       - name: Detect tar files in ${{ env.TAR_DIR }}
         id: detect
@@ -34,15 +39,18 @@ jobs:
 
           TRAIN_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-train*' -name '*.tar' 2>/dev/null | sort | tail -1)
           INFERENCE_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-inference*' -name '*.tar' 2>/dev/null | sort | tail -1)
+          ALL_TAR=$(find "$TAR_DIR" -maxdepth 1 -name '*flagscale-all*' -name '*.tar' 2>/dev/null | sort | tail -1)
 
           echo "train_tar=${TRAIN_TAR}" >> $GITHUB_OUTPUT
           echo "inference_tar=${INFERENCE_TAR}" >> $GITHUB_OUTPUT
+          echo "all_tar=${ALL_TAR}" >> $GITHUB_OUTPUT
 
-          if [ -n "$TRAIN_TAR" ] || [ -n "$INFERENCE_TAR" ]; then
+          if [ -n "$TRAIN_TAR" ] || [ -n "$INFERENCE_TAR" ] || [ -n "$ALL_TAR" ]; then
             echo "needs_promotion=true" >> $GITHUB_OUTPUT
             echo "Detected tars:"
             if [ -n "$TRAIN_TAR" ];     then echo "  train:     $TRAIN_TAR"; fi
             if [ -n "$INFERENCE_TAR" ]; then echo "  inference: $INFERENCE_TAR"; fi
+            if [ -n "$ALL_TAR" ];       then echo "  all:       $ALL_TAR"; fi
           else
             echo "needs_promotion=false" >> $GITHUB_OUTPUT
             echo "No tar files found in $TAR_DIR, skipping promotion"
@@ -59,6 +67,7 @@ jobs:
     outputs:
       remote_train_tag: ${{ steps.promote_train.outputs.remote_tag }}
       remote_inference_tag: ${{ steps.promote_inference.outputs.remote_tag }}
+      remote_all_tag: ${{ steps.promote_all.outputs.remote_tag }}
     steps:
       - name: Login to Harbor registry
         uses: docker/login-action@v3
@@ -75,7 +84,7 @@ jobs:
           TAR_PATH="${{ needs.prepare.outputs.train_tar }}"
 
           echo "Loading tar: $TAR_PATH"
-          LOCAL_TAG=$(docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}')
+          LOCAL_TAG=$(sudo docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}')
           echo "Loaded image tag: $LOCAL_TAG"
 
           # Strip local registry prefix (e.g. localhost:5000/) to get image:tag
@@ -98,7 +107,29 @@ jobs:
           TAR_PATH="${{ needs.prepare.outputs.inference_tar }}"
 
           echo "Loading tar: $TAR_PATH"
-          LOCAL_TAG=$(docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}')
+          LOCAL_TAG=$(sudo docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}')
+          echo "Loaded image tag: $LOCAL_TAG"
+
+          IMAGE_AND_TAG="${LOCAL_TAG#*/}"
+          REMOTE_TAG="${{ env.REMOTE_REGISTRY }}/${{ env.REMOTE_IMAGE_PREFIX }}/${IMAGE_AND_TAG}"
+
+          docker tag "${LOCAL_TAG}" "${REMOTE_TAG}"
+          docker push "${REMOTE_TAG}"
+          echo "remote_tag=${REMOTE_TAG}" >> $GITHUB_OUTPUT
+          echo "Pushed: ${REMOTE_TAG}"
+
+          echo "Removing tar: $TAR_PATH"
+          rm -f "$TAR_PATH"
+
+      - name: Promote all image to Harbor
+        id: promote_all
+        if: needs.prepare.outputs.all_tar != ''
+        run: |
+          set -euo pipefail
+          TAR_PATH="${{ needs.prepare.outputs.all_tar }}"
+
+          echo "Loading tar: $TAR_PATH"
+          LOCAL_TAG=$(sudo docker load -i "$TAR_PATH" | grep 'Loaded image:' | awk '{print $NF}')
           echo "Loaded image tag: $LOCAL_TAG"
 
           IMAGE_AND_TAG="${LOCAL_TAG#*/}"
@@ -132,6 +163,7 @@ jobs:
 
           REMOTE_TRAIN="${{ needs.promote.outputs.remote_train_tag }}"
           REMOTE_INFERENCE="${{ needs.promote.outputs.remote_inference_tag }}"
+          REMOTE_ALL="${{ needs.promote.outputs.remote_all_tag }}"
 
           if [ -n "$REMOTE_TRAIN" ]; then
             sed -i "s|^ci_train_image:.*|ci_train_image: ${REMOTE_TRAIN}|" "$CONFIG_FILE"
@@ -143,6 +175,11 @@ jobs:
             echo "Updated ci_inference_image: ${REMOTE_INFERENCE}"
           fi
 
+          if [ -n "$REMOTE_ALL" ]; then
+            sed -i "s|^ci_image:.*|ci_image: ${REMOTE_ALL}|" "$CONFIG_FILE"
+            echo "Updated ci_image: ${REMOTE_ALL}"
+          fi
+
       - name: Commit and push updated config
         run: |
           git config user.name "github-actions[bot]"

From 3b99fa37e3cdff5663b7dd985bf28008ce0f9a42 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Mon, 13 Apr 2026 18:34:05 +0800
Subject: [PATCH 21/29] ci: pass proxy settings to docker build stages

---
 .github/workflows/build_image_cuda.yml | 10 ++++++++
 docker/cuda/Dockerfile.all             | 32 ++++++++++++++++++++++++++
 docker/cuda/Dockerfile.inference       | 32 ++++++++++++++++++++++++++
 docker/cuda/Dockerfile.train           | 32 ++++++++++++++++++++++++++
 4 files changed, 106 insertions(+)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index ec1b92abd2..252e54a068 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -124,6 +124,10 @@ jobs:
     name: Build ${{ matrix.task }}
     needs: prepare
     runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
+    env:
+      HTTP_PROXY: ${{ vars.HTTP_PROXY }}
+      HTTPS_PROXY: ${{ vars.HTTPS_PROXY }}
+      NO_PROXY: ${{ vars.NO_PROXY }}
     strategy:
       fail-fast: false
       matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
@@ -208,6 +212,12 @@ jobs:
             UV_VERSION=${{ env.UV_VERSION }}
             PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }}
             PKG_MGR=${{ env.PKG_MGR }}
+            HTTP_PROXY=${{ env.HTTP_PROXY }}
+            HTTPS_PROXY=${{ env.HTTPS_PROXY }}
+            NO_PROXY=${{ env.NO_PROXY }}
+            http_proxy=${{ env.HTTP_PROXY }}
+            https_proxy=${{ env.HTTPS_PROXY }}
+            no_proxy=${{ env.NO_PROXY }}
           no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}
 
       - name: Save image as tar
diff --git a/docker/cuda/Dockerfile.all b/docker/cuda/Dockerfile.all
index f38b3b3405..e40201b073 100644
--- a/docker/cuda/Dockerfile.all
+++ b/docker/cuda/Dockerfile.all
@@ -38,6 +38,14 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 # PyTorch wheel index (derived from CUDA version)
 ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128
 
+# Proxy settings (build-time ARGs, not ENV; NOTE: explicitly declared proxy ARGs are recorded in docker history)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # =============================================================================
 # BASE STAGE - System dependencies
 # =============================================================================
@@ -48,6 +56,14 @@ ARG PYTHON_VERSION
 ARG UV_VERSION
 ARG PKG_MGR
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # Root installation directory (single source of truth)
 ARG FLAGSCALE_HOME=/root
 
@@ -128,6 +144,14 @@ ARG PYTORCH_INDEX
 ARG PKG_MGR
 ARG FLAGSCALE_HOME=/root
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # PyPI index URLs (re-declare to use in this stage)
 ARG PIP_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL
@@ -186,6 +210,14 @@ ARG PYTORCH_INDEX
 ARG PKG_MGR
 ARG FLAGSCALE_HOME=/root
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # PyPI index URLs
 ARG PIP_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL
diff --git a/docker/cuda/Dockerfile.inference b/docker/cuda/Dockerfile.inference
index 25788401b5..a6d3636292 100644
--- a/docker/cuda/Dockerfile.inference
+++ b/docker/cuda/Dockerfile.inference
@@ -36,6 +36,14 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 # PyTorch wheel index (derived from CUDA version)
 ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # =============================================================================
 # BASE STAGE - System dependencies
 # =============================================================================
@@ -46,6 +54,14 @@ ARG PYTHON_VERSION
 ARG UV_VERSION
 ARG PKG_MGR
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # Root installation directory (single source of truth)
 ARG FLAGSCALE_HOME=/root
 
@@ -126,6 +142,14 @@ ARG PYTORCH_INDEX
 ARG PKG_MGR
 ARG FLAGSCALE_HOME=/root
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # PyPI index URLs (re-declare to use in this stage)
 ARG PIP_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL
@@ -184,6 +208,14 @@ ARG PYTORCH_INDEX
 ARG PKG_MGR
 ARG FLAGSCALE_HOME=/root
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # PyPI index URLs
 ARG PIP_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL
diff --git a/docker/cuda/Dockerfile.train b/docker/cuda/Dockerfile.train
index b6de815865..06fa58bc0e 100644
--- a/docker/cuda/Dockerfile.train
+++ b/docker/cuda/Dockerfile.train
@@ -36,6 +36,14 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 # PyTorch wheel index (derived from CUDA version)
 ARG PYTORCH_INDEX=https://download.pytorch.org/whl/cu128
 
+# Proxy settings (build-time ARGs, not ENV; NOTE: explicitly declared proxy ARGs are recorded in docker history)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # =============================================================================
 # BASE STAGE - System dependencies
 # =============================================================================
@@ -46,6 +54,14 @@ ARG PYTHON_VERSION
 ARG UV_VERSION
 ARG PKG_MGR
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # Root installation directory (single source of truth)
 ARG FLAGSCALE_HOME=/root
 
@@ -127,6 +143,14 @@ ARG PKG_MGR
 ARG FLAGSCALE_HOME=/root
 ARG PYTHON_VERSION=3.12
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # PyPI index URLs (re-declare to use in this stage)
 ARG PIP_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL
@@ -186,6 +210,14 @@ ARG PKG_MGR
 ARG FLAGSCALE_HOME=/root
 ARG PYTHON_VERSION=3.12
 
+# Proxy settings (build-time only)
+ARG HTTP_PROXY
+ARG HTTPS_PROXY
+ARG NO_PROXY
+ARG http_proxy=${HTTP_PROXY}
+ARG https_proxy=${HTTPS_PROXY}
+ARG no_proxy=${NO_PROXY}
+
 # PyPI index URLs
 ARG PIP_INDEX_URL
 ARG PIP_EXTRA_INDEX_URL

From c4c3e10e09432a1b88635e50b496433e6f2d9bef Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Tue, 14 Apr 2026 14:24:12 +0800
Subject: [PATCH 22/29] ci: auto-detect proxy from runner environment for
 docker build

---
 .github/workflows/build_image_cuda.yml | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 252e54a068..bbce763f0c 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -128,6 +128,7 @@ jobs:
       HTTP_PROXY: ${{ vars.HTTP_PROXY }}
       HTTPS_PROXY: ${{ vars.HTTPS_PROXY }}
       NO_PROXY: ${{ vars.NO_PROXY }}
+      # Note: vars.* may be empty for fork PRs; proxy is detected from runner env in the 'proxy' step
     strategy:
       fail-fast: false
       matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
@@ -196,6 +197,18 @@ jobs:
             echo "| Build Tag | \`${BUILD_TAG}\` |"
           } >> $GITHUB_STEP_SUMMARY
 
+      - name: Detect proxy from runner environment
+        id: proxy
+        run: |
+          # Prefer the runner's lowercase vars (uppercase ones come from the job-level env and may be empty); never log raw proxy URLs, they may embed credentials
+          HTTP_PROXY_VAL="${http_proxy:-${HTTP_PROXY:-}}"
+          HTTPS_PROXY_VAL="${https_proxy:-${HTTPS_PROXY:-}}"
+          NO_PROXY_VAL="${no_proxy:-${NO_PROXY:-}}"
+          echo "http_proxy=${HTTP_PROXY_VAL}" >> "$GITHUB_OUTPUT"
+          echo "https_proxy=${HTTPS_PROXY_VAL}" >> "$GITHUB_OUTPUT"
+          echo "no_proxy=${NO_PROXY_VAL}" >> "$GITHUB_OUTPUT"
+          echo "Detected proxies: HTTP=${HTTP_PROXY_VAL:+set} HTTPS=${HTTPS_PROXY_VAL:+set} NO_PROXY=${NO_PROXY_VAL:+set}"
+
       - name: Build Docker image
         uses: docker/build-push-action@v6
         with:
@@ -212,12 +225,12 @@ jobs:
             UV_VERSION=${{ env.UV_VERSION }}
             PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }}
             PKG_MGR=${{ env.PKG_MGR }}
-            HTTP_PROXY=${{ env.HTTP_PROXY }}
-            HTTPS_PROXY=${{ env.HTTPS_PROXY }}
-            NO_PROXY=${{ env.NO_PROXY }}
-            http_proxy=${{ env.HTTP_PROXY }}
-            https_proxy=${{ env.HTTPS_PROXY }}
-            no_proxy=${{ env.NO_PROXY }}
+            HTTP_PROXY=${{ steps.proxy.outputs.http_proxy }}
+            HTTPS_PROXY=${{ steps.proxy.outputs.https_proxy }}
+            NO_PROXY=${{ steps.proxy.outputs.no_proxy }}
+            http_proxy=${{ steps.proxy.outputs.http_proxy }}
+            https_proxy=${{ steps.proxy.outputs.https_proxy }}
+            no_proxy=${{ steps.proxy.outputs.no_proxy }}
           no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}
 
       - name: Save image as tar

From 1a7a3bc451af35b879c933e84eafbd648763ba71 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Wed, 15 Apr 2026 10:52:48 +0800
Subject: [PATCH 23/29] ci: add runs_on parameter for custom runner selection

---
 .github/workflows/all_tests_common.yml | 11 +++++++++++
 .github/workflows/build_image_cuda.yml |  1 +
 2 files changed, 12 insertions(+)

diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index b15f455faf..8f4a8d8562 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -17,6 +17,11 @@ on:
         type: string
         description: Override inference image. Falls back to platform config if not set.
         default: ""
+      runs_on:
+        required: false
+        type: string
+        description: Override runs_on. Falls back to platform config if not set.
+        default: ""
 
 jobs:
   checkout_and_config:
@@ -117,6 +122,12 @@ jobs:
           if [ -n "${{ inputs.ci_inference_image }}" ]; then
             echo "ci_inference_image=${{ inputs.ci_inference_image }}" >> $GITHUB_OUTPUT
           fi
+          # Use a single-quoted assignment so bash treats the JSON value literally:
+          # the double quotes inside the JSON would otherwise terminate a double-quoted echo argument.
+          RUNS_ON_INPUT='${{ inputs.runs_on }}'
+          if [ -n "$RUNS_ON_INPUT" ]; then
+            { echo 'runs_on<<EOFRUNSON_INPUT'; echo "$RUNS_ON_INPUT"; echo 'EOFRUNSON_INPUT'; } >> $GITHUB_OUTPUT
+          fi
 
   # CLI validation runs first (outside virtual env) as a gate for all subsequent tests
   cli_validation:
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index bbce763f0c..d1c4f80322 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -318,3 +318,4 @@ jobs:
       platform: cuda
       ci_train_image: ${{ needs.build.outputs.train_tag }}
       ci_inference_image: ${{ needs.build.outputs.inference_tag }}
+      runs_on: '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]'

From fcbdbcce9308c0b809a4084913813fc6b68937cb Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Wed, 15 Apr 2026 13:29:09 +0800
Subject: [PATCH 24/29] ci: add runs_on and container_volumes as overridable
 workflow inputs

---
 .github/workflows/all_tests_common.yml | 10 ++++++++++
 .github/workflows/build_image_cuda.yml | 13 ++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index 8f4a8d8562..e2ad875a9a 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -22,6 +22,11 @@ on:
         type: string
         description: Override runs_on. Falls back to platform config if not set.
         default: ""
+      container_volumes:
+        required: false
+        type: string
+        description: Override container_volumes. Falls back to platform config if not set.
+        default: ""
 
 jobs:
   checkout_and_config:
@@ -128,6 +133,11 @@ jobs:
           if [ -n "$RUNS_ON_INPUT" ]; then
             { echo 'runs_on<<EOFRUNSON_INPUT'; echo "$RUNS_ON_INPUT"; echo 'EOFRUNSON_INPUT'; } >> $GITHUB_OUTPUT
           fi
+          # Single-quoted assignment so bash treats the JSON value literally (it contains double quotes)
+          CONTAINER_VOLUMES_INPUT='${{ inputs.container_volumes }}'
+          if [ -n "$CONTAINER_VOLUMES_INPUT" ]; then
+            { echo 'container_volumes<<EOFRUNSON_INPUT'; echo "$CONTAINER_VOLUMES_INPUT"; echo 'EOFRUNSON_INPUT'; } >> $GITHUB_OUTPUT
+          fi
 
   # CLI validation runs first (outside virtual env) as a gate for all subsequent tests
   cli_validation:
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index d1c4f80322..081eb469da 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -45,6 +45,16 @@ on:
         description: 'Directory to store image tar files'
         type: string
         default: '/home/flagscale_cicd/images_tar'
+      runs_on:
+        required: false
+        type: string
+        description: Override runs_on. Falls back to platform config if not set.
+        default: '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]'
+      container_volumes:
+        required: false
+        type: string
+        description: Override container_volumes. Falls back to platform config if not set.
+        default: '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]'
 
   # Trigger on PRs that modify docker-related files (build + test only, no push)
   pull_request:
@@ -318,4 +328,5 @@ jobs:
       platform: cuda
       ci_train_image: ${{ needs.build.outputs.train_tag }}
       ci_inference_image: ${{ needs.build.outputs.inference_tag }}
-      runs_on: '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]'
+      runs_on: ${{ inputs.runs_on || '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' }}
+      container_volumes: ${{ inputs.container_volumes || '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]'}}

From d6301a7b76ba9d5ba1e47a42a62d5a9dee886c84 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Wed, 15 Apr 2026 13:29:54 +0800
Subject: [PATCH 25/29] ci: wrap long runs_on and container_volumes values in
 folded scalars

---
 .github/workflows/build_image_cuda.yml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 081eb469da..945203be73 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -54,7 +54,9 @@ on:
         required: false
         type: string
         description: Override container_volumes. Falls back to platform config if not set.
-        default: '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]'
+        default: >-
+          ["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data",
+          "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]
 
   # Trigger on PRs that modify docker-related files (build + test only, no push)
   pull_request:
@@ -328,5 +330,10 @@ jobs:
       platform: cuda
       ci_train_image: ${{ needs.build.outputs.train_tag }}
       ci_inference_image: ${{ needs.build.outputs.inference_tag }}
-      runs_on: ${{ inputs.runs_on || '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' }}
-      container_volumes: ${{ inputs.container_volumes || '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data", "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]'}}
+      runs_on: >-
+        ${{ inputs.runs_on ||
+        '["self-hosted", "Linux", "X64", "nvidia-0", "gpus-8"]' }}
+      container_volumes: >-
+        ${{ inputs.container_volumes ||
+        '["/home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data",
+        "/home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers"]' }}

From abcb13bec7bd246f388be1bbe515110b1272fe62 Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Wed, 15 Apr 2026 15:52:33 +0800
Subject: [PATCH 26/29] ci: temporarily disable unit_tests result check

---
 .github/workflows/all_tests_common.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index e2ad875a9a..f1ed893fcb 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -314,11 +314,11 @@ jobs:
           # Check all test jobs (skip if not run)
           failed=false
 
-          if [ "${{ needs.unit_tests.result }}" != "success" ] && \
-             [ "${{ needs.unit_tests.result }}" != "skipped" ]; then
-            echo "❌ Unit tests failed"
-            failed=true
-          fi
+          # if [ "${{ needs.unit_tests.result }}" != "success" ] && \
+          #    [ "${{ needs.unit_tests.result }}" != "skipped" ]; then
+          #   echo "❌ Unit tests failed"
+          #   failed=true
+          # fi
 
           if [ "${{ needs.cli_validation.result }}" != "success" ] && \
              [ "${{ needs.cli_validation.result }}" != "skipped" ]; then

From 747517ca5f7ef2a0f8a2d9100663ec82ed5eb30f Mon Sep 17 00:00:00 2001
From: zihugithub <fbye@baai.ac.cn>
Date: Wed, 15 Apr 2026 19:15:31 +0800
Subject: [PATCH 27/29] ci: update Ascend volume paths and re-enable unit tests

---
 .github/configs/ascend.yml             |  4 +-
 .github/workflows/all_tests_common.yml | 66 +++++++++++++-------------
 tests/__init__.py                      |  1 +
 tests/unit_tests/__init__.py           |  1 +
 4 files changed, 37 insertions(+), 35 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/unit_tests/__init__.py

diff --git a/.github/configs/ascend.yml b/.github/configs/ascend.yml
index d2879fabbd..db071e052d 100644
--- a/.github/configs/ascend.yml
+++ b/.github/configs/ascend.yml
@@ -15,8 +15,8 @@ runner_labels: ["flagscale-ascend-ascend910-gpu2-32c-128g"]
 
 # Container volumes (hardware-specific paths)
 container_volumes:
-  - /public/cicd/baai_datasets:/home/gitlab-runner/data
-  - /public/cicd/baai_tokenizers:/home/gitlab-runner/tokenizers
+  - /public-ks3/cicd/baai_datasets:/home/gitlab-runner/data
+  - /public-ks3/cicd/baai_tokenizers:/home/gitlab-runner/tokenizers
   - /usr/local/Ascend/driver:/usr/local/Ascend/driver:ro
   - /usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons:ro
   - /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi:ro
diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
index f1ed893fcb..94cc9cdbdc 100644
--- a/.github/workflows/all_tests_common.yml
+++ b/.github/workflows/all_tests_common.yml
@@ -154,34 +154,34 @@ jobs:
       env_name: ${{ needs.checkout_and_config.outputs.env_name_train }}
       env_path: ${{ needs.checkout_and_config.outputs.env_path }}
 
-  # unit_tests:
-  #   needs:
-  #     - checkout_and_config
-  #     - cli_validation
-  #   if: fromJson(needs.checkout_and_config.outputs.device_types)[0] != null
-  #   strategy:
-  #     fail-fast: false
-  #     matrix:
-  #       device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
-  #   uses: ./.github/workflows/unit_tests_common.yml
-  #   name: unit_tests
-  #   with:
-  #     platform: ${{ inputs.platform }}
-  #     device: ${{ matrix.device }}
-  #     image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
-  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
-  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
-  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
-  #     source_artifact: flagscale-source-${{ github.sha }}
-  #     pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
-  #     env_name: ${{ needs.checkout_and_config.outputs.env_name_train }}
-  #     env_path: ${{ needs.checkout_and_config.outputs.env_path }}
+  unit_tests:
+    needs:
+      - checkout_and_config
+      - cli_validation
+    if: fromJson(needs.checkout_and_config.outputs.device_types)[0] != null
+    strategy:
+      fail-fast: false
+      matrix:
+        device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
+    uses: ./.github/workflows/unit_tests_common.yml
+    name: unit_tests
+    with:
+      platform: ${{ inputs.platform }}
+      device: ${{ matrix.device }}
+      image: ${{ needs.checkout_and_config.outputs.ci_train_image }}
+      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      source_artifact: flagscale-source-${{ github.sha }}
+      pkg_mgr: ${{ needs.checkout_and_config.outputs.pkg_mgr }}
+      env_name: ${{ needs.checkout_and_config.outputs.env_name_train }}
+      env_path: ${{ needs.checkout_and_config.outputs.env_path }}
 
   functional_tests_train:
     needs:
       - checkout_and_config
       - cli_validation
-      # - unit_tests
+      - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_train.yml
     with:
@@ -200,7 +200,7 @@ jobs:
     needs:
       - checkout_and_config
       - cli_validation
-      # - unit_tests
+      - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.hetero_train_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_hetero_train.yml
     with:
@@ -219,7 +219,7 @@ jobs:
     needs:
       - checkout_and_config
       - cli_validation
-      # - unit_tests
+      - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.inference_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_inference.yml
     with:
@@ -238,7 +238,7 @@ jobs:
     needs:
       - checkout_and_config
       - cli_validation
-      # - unit_tests
+      - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.serve_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_serve.yml
     with:
@@ -276,7 +276,7 @@ jobs:
     needs:
       - checkout_and_config
       - cli_validation
-      # - unit_tests
+      - unit_tests
     if: fromJson(needs.checkout_and_config.outputs.benchmark_test_matrix)[0] != null
     uses: ./.github/workflows/functional_tests_benchmark.yml
     with:
@@ -298,7 +298,7 @@ jobs:
     needs:
       - checkout_and_config
       - cli_validation
-      # - unit_tests
+      - unit_tests
       - functional_tests_train
       - functional_tests_hetero_train
       - functional_tests_benchmark
@@ -314,11 +314,11 @@ jobs:
           # Check all test jobs (skip if not run)
           failed=false
 
-          # if [ "${{ needs.unit_tests.result }}" != "success" ] && \
-          #    [ "${{ needs.unit_tests.result }}" != "skipped" ]; then
-          #   echo "❌ Unit tests failed"
-          #   failed=true
-          # fi
+          if [ "${{ needs.unit_tests.result }}" != "success" ] && \
+             [ "${{ needs.unit_tests.result }}" != "skipped" ]; then
+            echo "❌ Unit tests failed"
+            failed=true
+          fi
 
           if [ "${{ needs.cli_validation.result }}" != "success" ] && \
              [ "${{ needs.cli_validation.result }}" != "skipped" ]; then
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000000..db5dae1f3c
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1 @@
+# This file makes tests/ a Python package
diff --git a/tests/unit_tests/__init__.py b/tests/unit_tests/__init__.py
new file mode 100644
index 0000000000..9add4932b0
--- /dev/null
+++ b/tests/unit_tests/__init__.py
@@ -0,0 +1 @@
+# This file makes tests/unit_tests/ a Python package

From 852340d79f0d4e2eac4132babf627df3a0519a40 Mon Sep 17 00:00:00 2001
From: liyuzhuo <lee.yuzhuo233@gmail.com>
Date: Thu, 16 Apr 2026 14:38:28 +0800
Subject: [PATCH 28/29] ci: ignore docs changes; skip CUDA tests on docker PRs

---
 .github/workflows/all_tests_ascend.yml |  2 ++
 .github/workflows/all_tests_cuda.yml   | 36 +++++++++++++++++++++++++-
 .github/workflows/all_tests_metax.yml  |  2 ++
 .github/workflows/build_image_cuda.yml |  2 +-
 .github/workflows/format_check.yml     |  2 ++
 5 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/all_tests_ascend.yml b/.github/workflows/all_tests_ascend.yml
index 459c8616a9..401eeb813f 100644
--- a/.github/workflows/all_tests_ascend.yml
+++ b/.github/workflows/all_tests_ascend.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches: ["main"]
     paths-ignore:
+      - 'docs/**'
       - 'docker/cuda/**'
       - 'docker/build.sh'
       - 'tools/install/**'
@@ -13,6 +14,7 @@ on:
   pull_request:
     branches: ["main"]
     paths-ignore:
+      - 'docs/**'
       - 'docker/cuda/**'
       - 'docker/build.sh'
       - 'tools/install/**'
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
index 2374e10850..db6215ceab 100644
--- a/.github/workflows/all_tests_cuda.yml
+++ b/.github/workflows/all_tests_cuda.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches: ["main"]
     paths-ignore:
+      - 'docs/**'
       - 'docker/cuda/**'
       - 'docker/build.sh'
       - 'tools/install/**'
@@ -13,6 +14,7 @@ on:
   pull_request:
     branches: ["main"]
     paths-ignore:
+      - 'docs/**'
       - 'docker/cuda/**'
       - 'docker/build.sh'
       - 'tools/install/**'
@@ -25,19 +27,51 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
+  # ---------------------------------------------------------------------------
+  # Guard: skip tests when PR contains docker-related changes, because
+  # build_image_cuda.yml will handle build + test for those PRs.
+  # paths-ignore alone cannot guarantee mutual exclusivity when a PR touches
+  # both docker files and non-docker files, so we need this job-level check.
+  # ---------------------------------------------------------------------------
+  check_docker_changes:
+    runs-on: ubuntu-latest
+    if: github.event_name == 'pull_request'  # skipped on other events; the output is then empty
+    outputs:
+      has_docker_changes: ${{ steps.check.outputs.has_docker_changes }}  # 'true' or 'false'
+    steps:
+      - name: Check for docker-related file changes
+        id: check
+        env:
+          GH_TOKEN: ${{ github.token }}  # credentials for the gh CLI
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          # List every file touched by the PR (one path per line); inputs come from env, not inline expressions.
+          CHANGED=$(gh api "repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename')
+          HAS_DOCKER_CHANGES=false
+          if grep -qE '^(docker/cuda/|docker/build\.sh$|tools/install/|requirements/|\.github/workflows/build_image_cuda\.yml$)' <<<"$CHANGED"; then HAS_DOCKER_CHANGES=true; fi
+          echo "has_docker_changes=${HAS_DOCKER_CHANGES}" >> "$GITHUB_OUTPUT"
+
   run_tests:
+    needs: check_docker_changes
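+    # always() is required: check_docker_changes is skipped on non-PR events, which would otherwise skip this job too.
+    # On push: always run (paths-ignore already filtered). On PR: only run when no docker-related files changed.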
+    if: always() && (github.event_name == 'push' || needs.check_docker_changes.outputs.has_docker_changes != 'true')
     # Package manager and environment settings are read from .github/configs/cuda.yml
     uses: ./.github/workflows/all_tests_common.yml
     with:
       platform: cuda
 
   all_tests:
-    needs: run_tests
+    needs: [check_docker_changes, run_tests]
     runs-on: ubuntu-latest
     if: always()
     steps:
       - name: Verify workflow status
         run: |
+          if [ "${{ needs.check_docker_changes.outputs.has_docker_changes }}" = "true" ]; then
+            echo "⏭️ Skipped - docker changes detected, handled by build_image_cuda workflow"
+            exit 0
+          fi
           if [ "${{ needs.run_tests.result }}" != "success" ]; then
             echo "❌ Tests workflow failed"
             exit 1
diff --git a/.github/workflows/all_tests_metax.yml b/.github/workflows/all_tests_metax.yml
index b350a6c4e0..d459e040b6 100644
--- a/.github/workflows/all_tests_metax.yml
+++ b/.github/workflows/all_tests_metax.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches: ["main"]
     paths-ignore:
+      - 'docs/**'
       - 'docker/cuda/**'
       - 'docker/build.sh'
       - 'tools/install/**'
@@ -13,6 +14,7 @@ on:
   pull_request:
     branches: ["main"]
     paths-ignore:
+      - 'docs/**'
       - 'docker/cuda/**'
       - 'docker/build.sh'
       - 'tools/install/**'
diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 945203be73..3e8a500899 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -61,7 +61,7 @@ on:
   # Trigger on PRs that modify docker-related files (build + test only, no push)
   pull_request:
     branches: [main]
-    paths:
+    paths:∏
       - 'docker/cuda/**'
       - 'docker/build.sh'
       - 'tools/install/**'
diff --git a/.github/workflows/format_check.yml b/.github/workflows/format_check.yml
index 2aa817cc58..ba3ca9b97b 100644
--- a/.github/workflows/format_check.yml
+++ b/.github/workflows/format_check.yml
@@ -4,6 +4,8 @@ on:
   pull_request:
     branches: [ "main" ]
     types: [opened, synchronize, reopened]
+    paths-ignore:
+      - 'docs/**'
 
 jobs:
   format:

From 264d68e8bc7c386eef3a20c1e00383502e716aec Mon Sep 17 00:00:00 2001
From: liyuzhuo <lee.yuzhuo233@gmail.com>
Date: Thu, 16 Apr 2026 14:54:29 +0800
Subject: [PATCH 29/29] Fix stray character after paths: in build_image_cuda.yml

---
 .github/workflows/build_image_cuda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_image_cuda.yml b/.github/workflows/build_image_cuda.yml
index 3e8a500899..945203be73 100644
--- a/.github/workflows/build_image_cuda.yml
+++ b/.github/workflows/build_image_cuda.yml
@@ -61,7 +61,7 @@ on:
   # Trigger on PRs that modify docker-related files (build + test only, no push)
   pull_request:
     branches: [main]
-    paths:∏
+    paths:
       - 'docker/cuda/**'
       - 'docker/build.sh'
       - 'tools/install/**'