Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
2314a21
[CICD] Add auto build and push CUDA Docker images to Harbor pipeline
Mar 10, 2026
6a6a947
fix: code style adjustments in PR
Mar 10, 2026
7843266
Set safe directory
Mar 10, 2026
1f895eb
Clean workspace
Mar 10, 2026
903c520
Remove redundant clean workspace steps before checkout
Mar 11, 2026
84d54bc
fix(ci): use head_ref for PR checkout to avoid detached HEAD on push
Mar 12, 2026
753242a
fix git fetch failure in update-config job
Mar 16, 2026
26c3a14
Clean workspace
Mar 17, 2026
bf71fbe
add paths-ignore for test workflow and fix PR checkout in build workflow
Mar 17, 2026
9cfa2e1
support fork PRs in build_image_cuda workflow
Mar 17, 2026
7310831
use FORK_PUSH_TOKEN for fork PR push and add token-help job
Mar 17, 2026
8b39e35
fix(ci): use stable image tags without timestamp for registry push
Mar 17, 2026
a510a24
debug1
Mar 20, 2026
7513472
Merge branch 'flagos-ai:main' into auto-build-push-image-to-harbor
zihugithub Mar 21, 2026
3492db7
re-enable Docker build/push and CUDA tests in build_image_cuda workflow
Mar 21, 2026
03ee149
add two-stage pipeline to support fork PR builds
Mar 30, 2026
bc95289
debug2
Mar 31, 2026
6c5a953
Merge branch 'main' into auto-build-push-image-to-harbor
zihugithub Apr 2, 2026
3095f1c
Merge branch 'flagos-ai:main' into auto-build-push-image-to-harbor
zihugithub Apr 8, 2026
02ffa31
ci: refactor build_image_cuda workflow
Apr 8, 2026
f37da2e
Merge branch 'auto-build-push-image-to-harbor' of https://github.com/…
Apr 8, 2026
56c30b1
ci: trigger push_image_harbor on build workflow success
Apr 9, 2026
63ae083
debug3
Apr 9, 2026
cd02656
ci: add paths-ignore for ascend/metax, fix sudo docker load, support …
Apr 9, 2026
3b99fa3
ci: pass proxy settings to docker build stages
Apr 13, 2026
cd777fb
Merge branch 'main' into auto-build-push-image-to-harbor
zihugithub Apr 13, 2026
c4c3e10
ci: auto-detect proxy from runner environment for docker build
Apr 14, 2026
b78b2f1
Merge branch 'auto-build-push-image-to-harbor' of https://github.com/…
Apr 14, 2026
1a7a3bc
ci: add runs_on parameter for custom runner selection
Apr 15, 2026
fcbdbcc
ci: add runs_on and container_volumes as overridable workflow inputs
Apr 15, 2026
d6301a7
ci: add runs_on and container_volumes as overridable workflow inputs
Apr 15, 2026
abcb13b
debug0
Apr 15, 2026
cbf4723
Merge branch 'main' into auto-build-push-image-to-harbor
zihugithub Apr 15, 2026
747517c
ci: update Ascend volume paths and re-enable unit tests
Apr 15, 2026
852340d
ignore docs
Darryl233 Apr 16, 2026
264d68e
syntax
Darryl233 Apr 16, 2026
b1fe283
resolve conflict
Darryl233 Apr 20, 2026
d7eb059
fix
Darryl233 Apr 21, 2026
5c9eeb4
fix
Darryl233 Apr 21, 2026
9c3f0b4
unify tar dir
Darryl233 Apr 22, 2026
cd4e2d5
cleanup
Darryl233 Apr 22, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .github/workflows/all_tests_cuda.yml
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please ignore docs as well

Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
name: cuda_tests

on:
# Trigger after Build Docker Images - CUDA succeeds
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will a pull request to main trigger both Build Docker Images and all_tests_cuda?

workflow_run:
workflows: ["Build Docker Images - CUDA"]
types: [completed]

push:
branches: ["main"]
pull_request:
Expand All @@ -12,6 +17,10 @@ concurrency:

jobs:
run_tests:
# Skip if triggered by workflow_run but the build did not succeed
if: >-
github.event_name != 'workflow_run' ||
github.event.workflow_run.conclusion == 'success'
# Package manager and environment settings are read from .github/configs/cuda.yml
uses: ./.github/workflows/all_tests_common.yml
with:
Expand All @@ -29,3 +38,14 @@ jobs:
exit 1
fi
echo "✅ All tests passed!"

push_images_to_harbor:
  # Calls the push_image_harbor reusable workflow for the cuda platform.
  # Runs only when this workflow was started by a workflow_run event whose
  # upstream run concluded successfully, and the all_tests job succeeded.
  needs: all_tests
  uses: ./.github/workflows/push_image_harbor.yml
  secrets: inherit
  with:
    platform: cuda
  if: >-
    ${{
      github.event_name == 'workflow_run' &&
      github.event.workflow_run.conclusion == 'success' &&
      needs.all_tests.result == 'success'
    }}
298 changes: 298 additions & 0 deletions .github/workflows/build_image_cuda.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
name: Build Docker Images - CUDA

on:
  # Manual runs: every build knob is exposed as a workflow_dispatch input.
  workflow_dispatch:
    inputs:
      task:
        type: choice
        description: 'Task to build'
        required: true
        default: train
        options:
          - train
          - inference
          - all
      target:
        type: choice
        description: 'Build target stage (dev includes dev tools, release is production)'
        required: true
        default: dev
        options:
          - dev
          - release
      push:
        type: boolean
        description: 'Push image to registry'
        default: true
      no_cache:
        type: boolean
        description: 'Build without Docker cache'
        default: false
      pkg_mgr:
        type: choice
        description: 'Package manager to use'
        required: true
        default: conda
        options:
          - conda
          - uv
      build_all_tasks:
        type: boolean
        description: 'Build all tasks (train, inference, all) - overrides task selection'
        default: false

  # Pull requests against main that touch image-related files.
  pull_request:
    branches:
      - main
    paths:
      - 'docker/cuda/**'
      - 'docker/build.sh'
      - 'tools/install/**'
      - 'requirements/**'
      - '.github/workflows/build_image_cuda.yml'

# Write access to repository contents: the update-config job commits and
# pushes .github/configs/cuda.yml.
permissions:
  contents: write

# One active run per pull request (or per ref when there is no PR number);
# a newer run cancels the one in progress.
concurrency:
  cancel-in-progress: true
  group: >-
    ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

env:
  # Registry local to the CI runners (see .github/configs/cuda.yml)
  REGISTRY: 'localhost:5000'
  # Default build versions (keep in sync with docker/build.sh)
  CUDA_VERSION: '12.8.1'
  UBUNTU_VERSION: '22.04'
  PYTHON_VERSION: '3.12'
  UV_VERSION: '0.7.2'
  # Falls back to conda when no pkg_mgr input is supplied
  PKG_MGR: ${{ inputs.pkg_mgr || 'conda' }}

jobs:
# ---------------------------------------------------------------------------
# Prepare: compute build matrix and parameters based on trigger type
# ---------------------------------------------------------------------------
prepare:
  # Computes the task matrix and the target/push/no_cache parameters that the
  # build job consumes, based on which event started the workflow.
  name: Prepare build matrix
  runs-on: ubuntu-latest
  # This job only writes step outputs; it never touches the repository.
  permissions: {}
  outputs:
    matrix: ${{ steps.set-matrix.outputs.matrix }}
    target: ${{ steps.params.outputs.target }}
    push: ${{ steps.params.outputs.push }}
    no_cache: ${{ steps.params.outputs.no_cache }}
  steps:
    - name: Determine build matrix
      id: set-matrix
      # Expressions are handed over as environment variables instead of being
      # expanded into the script text, so their values are never parsed as shell.
      env:
        EVENT: ${{ github.event_name }}
        BUILD_ALL_TASKS: ${{ inputs.build_all_tasks }}
        TASK: ${{ inputs.task }}
      run: |
        set -euo pipefail

        if [ "$EVENT" = "workflow_dispatch" ] && [ "$BUILD_ALL_TASKS" != "true" ]; then
          # Manual trigger: build selected task only
          printf 'matrix={"task":["%s"]}\n' "$TASK" >> "$GITHUB_OUTPUT"
        else
          # PR or build_all_tasks=true: build all tasks
          echo 'matrix={"task":["train","inference","all"]}' >> "$GITHUB_OUTPUT"
        fi

    - name: Set build parameters
      id: params
      env:
        EVENT: ${{ github.event_name }}
        DISPATCH_TARGET: ${{ inputs.target || 'dev' }}
        DISPATCH_PUSH: ${{ inputs.push }}
        DISPATCH_NO_CACHE: ${{ inputs.no_cache }}
      run: |
        set -euo pipefail

        if [ "$EVENT" = "pull_request" ]; then
          # PR: always build dev images, push to local registry, use cache
          target="dev"
          push="true"
          no_cache="false"
        else
          # workflow_dispatch: use user-provided inputs
          target="$DISPATCH_TARGET"
          push="$DISPATCH_PUSH"
          no_cache="$DISPATCH_NO_CACHE"
        fi

        {
          echo "target=${target}"
          echo "push=${push}"
          echo "no_cache=${no_cache}"
        } >> "$GITHUB_OUTPUT"

# ---------------------------------------------------------------------------
# Build: build and push Docker images (matrix across tasks)
# ---------------------------------------------------------------------------
build:
  # Builds one image per matrix task, optionally pushes it to the local
  # registry, and exports the pushed tag as a per-task job output.
  name: Build ${{ matrix.task }}
  needs: prepare
  runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
  # The build only reads the repository. Narrowing the token here keeps the
  # workflow-level write permission away from PR-controlled Dockerfiles.
  permissions:
    contents: read
  strategy:
    fail-fast: false
    matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
  outputs:
    # Each matrix leg fills in only the output named after its own task.
    train_tag: ${{ steps.export.outputs.train_tag }}
    inference_tag: ${{ steps.export.outputs.inference_tag }}
    all_tag: ${{ steps.export.outputs.all_tag }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # No git operations follow, and the checkout directory is the Docker
        # build context, so do not leave the token in .git/config.
        persist-credentials: false

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
      with:
        # Use docker driver to avoid pulling moby/buildkit from Docker Hub
        driver: docker

    - name: Compute build metadata
      id: meta
      env:
        TASK: ${{ matrix.task }}
        TARGET: ${{ needs.prepare.outputs.target }}
      run: |
        set -euo pipefail

        # CUDA_VERSION, PYTHON_VERSION, UBUNTU_VERSION and REGISTRY reach this
        # shell through the workflow-level `env:` block.
        CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d. -f1)
        CUDA_MINOR=$(echo "$CUDA_VERSION" | cut -d. -f2)
        TIMESTAMP=$(date +%Y%m%d%H%M%S)

        # Image naming follows docker/build.sh convention:
        #   flagscale-<task>:<target>-cu<major><minor>-py<version>-<timestamp>
        IMAGE_NAME="flagscale-${TASK}"
        TAG="${TARGET}-cu${CUDA_MAJOR}${CUDA_MINOR}-py${PYTHON_VERSION}-${TIMESTAMP}"

        # Local registry tag (for CI runners)
        LOCAL_TAG="${REGISTRY}/${IMAGE_NAME}:${TAG}"

        # Derived build arguments
        BASE_IMAGE="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
        PYTORCH_INDEX="https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}"

        # Tags list: local registry only
        TAGS="${LOCAL_TAG}"

        {
          echo "image_name=${IMAGE_NAME}"
          echo "tag=${TAG}"
          echo "local_tag=${LOCAL_TAG}"
          echo "base_image=${BASE_IMAGE}"
          echo "pytorch_index=${PYTORCH_INDEX}"
          # Multi-line tags output
          echo "tags<<EOF"
          echo "${TAGS}"
          echo "EOF"
        } >> "$GITHUB_OUTPUT"

        # Job summary
        {
          echo "### Build: ${IMAGE_NAME}"
          echo ""
          echo "| Parameter | Value |"
          echo "|---|---|"
          echo "| Task | \`${TASK}\` |"
          echo "| Target | \`${TARGET}\` |"
          echo "| CUDA | \`${CUDA_VERSION}\` |"
          echo "| Python | \`${PYTHON_VERSION}\` |"
          echo "| Dockerfile | \`docker/cuda/Dockerfile.${TASK}\` |"
          echo "| Local Tag | \`${LOCAL_TAG}\` |"
        } >> "$GITHUB_STEP_SUMMARY"

    - name: Build Docker image
      uses: docker/build-push-action@v6
      with:
        context: .
        file: docker/cuda/Dockerfile.${{ matrix.task }}
        target: ${{ needs.prepare.outputs.target }}
        # Load into the local Docker daemon; pushing is a separate step below.
        load: true
        tags: ${{ steps.meta.outputs.tags }}
        build-args: |
          BASE_IMAGE=${{ steps.meta.outputs.base_image }}
          CUDA_VERSION=${{ env.CUDA_VERSION }}
          UBUNTU_VERSION=${{ env.UBUNTU_VERSION }}
          PYTHON_VERSION=${{ env.PYTHON_VERSION }}
          UV_VERSION=${{ env.UV_VERSION }}
          PYTORCH_INDEX=${{ steps.meta.outputs.pytorch_index }}
          PKG_MGR=${{ env.PKG_MGR }}
        no-cache: ${{ needs.prepare.outputs.no_cache == 'true' }}

    - name: Push Docker image
      if: needs.prepare.outputs.push == 'true'
      env:
        LOCAL_TAG: ${{ steps.meta.outputs.local_tag }}
      run: docker push "$LOCAL_TAG"

    - name: Export image tag for config update
      id: export
      if: success() && needs.prepare.outputs.push == 'true'
      env:
        TASK: ${{ matrix.task }}
        LOCAL_TAG: ${{ steps.meta.outputs.local_tag }}
      run: |
        # Output name is derived from the task: train_tag, inference_tag or all_tag
        echo "${TASK}_tag=${LOCAL_TAG}" >> "$GITHUB_OUTPUT"

    - name: Print build result
      if: success()
      env:
        PUSHED: ${{ needs.prepare.outputs.push }}
      run: |
        {
          echo ""
          echo "**Result:** Built successfully"
          echo "**Pushed:** ${PUSHED}"
        } >> "$GITHUB_STEP_SUMMARY"

# ---------------------------------------------------------------------------
# Update Config: update cuda.yml with localhost tags (temporary, for test validation)
# After tests pass, push_image_harbor.yml will promote to Harbor and update to final tags
# ---------------------------------------------------------------------------
update-config:
  # Writes the freshly pushed image tags into .github/configs/cuda.yml and
  # commits the change back to the branch that triggered the build.
  name: Update CI config
  needs: build
  runs-on: [self-hosted, Linux, X64, nvidia-0, gpus-8]
  # The only job that pushes a commit, so the write permission is declared here.
  permissions:
    contents: write
  # Skip pull requests coming from forks: their head branch does not exist in
  # this repository and the GITHUB_TOKEN issued for them is read-only, so the
  # checkout/push below could not succeed.
  if: >-
    needs.build.result == 'success' &&
    (github.event_name != 'pull_request' ||
     github.event.pull_request.head.repo.full_name == github.repository)
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Check out the branch itself (not the PR merge commit) so the commit
        # made below can be pushed back to it.
        ref: ${{ github.head_ref || github.ref }}
        token: ${{ secrets.GITHUB_TOKEN }}
        clean: true
        fetch-depth: 0

    - name: Update cuda.yml with new image tags
      env:
        TRAIN_TAG: ${{ needs.build.outputs.train_tag }}
        INFERENCE_TAG: ${{ needs.build.outputs.inference_tag }}
      run: |
        set -euo pipefail
        CONFIG_FILE=".github/configs/cuda.yml"

        if [ ! -f "$CONFIG_FILE" ]; then
          echo "::error::Config file not found: $CONFIG_FILE"
          exit 1
        fi

        # Replace the value of a top-level `<key>:` line with the given tag.
        # An empty tag means that task was not built or not pushed: do nothing.
        # A missing key is reported instead of being silently ignored by sed.
        update_image_key() {
          local key="$1" tag="$2"
          if [ -z "$tag" ]; then
            return 0
          fi
          if ! grep -q "^${key}:" "$CONFIG_FILE"; then
            echo "::warning::${key} not found in ${CONFIG_FILE}; nothing updated"
            return 0
          fi
          echo "Updating ${key} to: ${tag}"
          sed -i "s|^${key}:.*|${key}: ${tag}|" "$CONFIG_FILE"
        }

        update_image_key ci_train_image "$TRAIN_TAG"
        update_image_key ci_inference_image "$INFERENCE_TAG"

        echo "Updated config:"
        cat "$CONFIG_FILE"

    - name: Commit and push config update
      run: |
        set -euo pipefail
        git config user.name "github-actions[bot]"
        git config user.email "github-actions[bot]@users.noreply.github.com"
        git add .github/configs/cuda.yml
        if git diff --cached --quiet; then
          echo "No config changes to commit"
        else
          git commit -m "ci: update CUDA image tags [skip ci]"
          git push
        fi

# ---------------------------------------------------------------------------
# Summary: verify all builds completed
# ---------------------------------------------------------------------------
summary:
  # Final gate: always runs, and fails unless both the image builds and the
  # config update succeeded.
  name: Build summary
  # Depend on build as well, so a build failure is reported as such rather
  # than inferred from update-config having been skipped.
  needs: [build, update-config]
  runs-on: ubuntu-latest
  # Only inspects job results; no repository access required.
  permissions: {}
  if: always()
  steps:
    - name: Verify build results
      env:
        BUILD_RESULT: ${{ needs.build.result }}
        CONFIG_RESULT: ${{ needs.update-config.result }}
      run: |
        set -euo pipefail

        if [ "$BUILD_RESULT" != "success" ]; then
          echo "::error::One or more image builds failed (build result: ${BUILD_RESULT})"
          exit 1
        fi

        if [ "$CONFIG_RESULT" != "success" ]; then
          echo "::error::Images were built, but the CI config update did not succeed (update-config result: ${CONFIG_RESULT})"
          exit 1
        fi

        echo "All Docker images built successfully!"
Loading
Loading