Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/containers.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ jobs:
runner: linux-arm64-cpu
- image_repo: xgb-ci.manylinux_2_28_aarch64
runner: linux-arm64-cpu
- image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
runner: linux-arm64-cpu
steps:
- name: Workflow trigger information
run: |
Expand Down
8 changes: 8 additions & 0 deletions containers/ci_container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,14 @@ xgb-ci.gpu_build_cuda13_rockylinux8:
build_args:
CUDA_VERSION: "13.0.0"
NCCL_VERSION: *nccl_version
ARCH: x86_64

xgb-ci.gpu_build_cuda13_rockylinux8_aarch64:
container_def: gpu_build_cuda13_rockylinux8
build_args:
CUDA_VERSION: "13.0.0"
NCCL_VERSION: *nccl_version
ARCH: aarch64

xgb-ci.gpu_build_r_rockylinux8:
container_def: gpu_build_r_rockylinux8
Expand Down
16 changes: 10 additions & 6 deletions containers/dockerfile/Dockerfile.gpu_build_cuda13_rockylinux8
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ ARG CUDA_VERSION=notset
FROM nvcr.io/nvidia/cuda:$CUDA_VERSION-devel-rockylinux8
ARG CUDA_VERSION
ARG NCCL_VERSION
ARG MINIFORGE_VERSION=25.3.1-0
ARG ARCH=x86_64
ARG MINIFORGE_VERSION=25.11.0-1
ARG CMAKE_VERSION=4.1.0

SHELL ["/bin/bash", "-c"]
Expand All @@ -16,30 +17,33 @@ ENV GOSU_VERSION=1.10

# Install all basic requirements
RUN \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
{ [ $ARCH = "aarch64" ] && export CUDA_REPO_ARCH="sbsa" || export CUDA_REPO_ARCH="x86_64"; } && \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${CUDA_REPO_ARCH}/D42D0685.pub | sed '/^Version/d' \
> /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
dnf -y update && \
dnf -y install dnf-plugins-core && \
dnf config-manager --set-enabled powertools && \
dnf install -y tar unzip wget xz git which ninja-build gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \
# Miniforge
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/$MINIFORGE_VERSION/Miniforge3-$MINIFORGE_VERSION-Linux-x86_64.sh && \
wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/$MINIFORGE_VERSION/Miniforge3-$MINIFORGE_VERSION-Linux-${ARCH}.sh && \
bash conda.sh -b -p /opt/miniforge && \
/opt/miniforge/bin/python -m pip install awscli && \
# CMake
wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh && \
wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${ARCH}.sh && \
bash cmake.sh --skip-license --prefix=/usr

# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
{ [ $ARCH = "aarch64" ] && export CUDA_REPO_ARCH="sbsa" || export CUDA_REPO_ARCH="x86_64"; } && \
export NCCL_VERSION=$NCCL_VERSION && \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${CUDA_REPO_ARCH}/cuda-rhel8.repo && \
dnf -y update && \
dnf install -y libnccl-${NCCL_VERSION}+cuda13.0 libnccl-devel-${NCCL_VERSION}+cuda13.0 libnccl-static-${NCCL_VERSION}+cuda13.0

# Install lightweight sudo (not bound to TTY)
RUN set -ex; \
wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
{ [ $ARCH = "aarch64" ] && export GOSU_ARCH="arm64" || export GOSU_ARCH="amd64"; } && \
wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-${GOSU_ARCH}" && \
chmod +x /usr/local/bin/gosu && \
gosu nobody true

Expand Down
2 changes: 1 addition & 1 deletion containers/dockerfile/Dockerfile.gpu_build_r_rockylinux8
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ ENV GOSU_VERSION=1.10

# Install all basic requirements
RUN \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub | sed '/^Version/d' \
> /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
dnf -y update && \
dnf -y install dnf-plugins-core && \
Expand Down
4 changes: 2 additions & 2 deletions containers/dockerfile/Dockerfile.gpu_build_rockylinux8
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ ENV GOSU_VERSION=1.10

# Install all basic requirements
RUN \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub | sed '/^Version/d' \
> /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
dnf -y update && \
dnf -y install dnf-plugins-core && \
Expand All @@ -34,7 +34,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export NCCL_VERSION=$NCCL_VERSION && \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
dnf -y update && \
dnf install -y libnccl-${NCCL_VERSION}+cuda12.9 libnccl-devel-${NCCL_VERSION}+cuda12.9

Expand Down
2 changes: 1 addition & 1 deletion containers/dockerfile/Dockerfile.jvm_gpu_build
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ RUN \
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
export NCCL_VERSION=$NCCL_VERSION && \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
dnf -y update && \
dnf install -y libnccl-${NCCL_VERSION}+cuda12.9 libnccl-devel-${NCCL_VERSION}+cuda12.9 libnccl-static-${NCCL_VERSION}+cuda12.9

Expand Down
13 changes: 13 additions & 0 deletions vm_images/linux-arm64/bootstrap.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ sudo systemctl is-active --quiet docker.service || sudo systemctl start docker.s
sudo systemctl is-enabled --quiet docker.service || sudo systemctl enable docker.service
sleep 10 # Docker daemon takes time to come up after installing
sudo docker info

## Install NVIDIA Container Toolkit
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker

sleep 10
sudo docker run --rm --gpus all ubuntu nvidia-smi
sudo systemctl stop docker

## Install AWS CLI v2
Expand Down
14 changes: 14 additions & 0 deletions vm_images/linux-arm64/install_drivers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Provisioning script: installs a set of basic build tools, then the
# nvidia-open-580 driver package using the cuda-keyring package downloaded
# from NVIDIA's ubuntu2404/sbsa repository path.
# Aborts on the first failing command, unset variable, or failed pipeline stage.
set -euo pipefail

## Install basic tools
# Select debconf's Noninteractive frontend before any package is installed.
sudo debconf-set-selections <<< 'debconf debconf/frontend select Noninteractive'

base_packages=(
  cmake
  git
  build-essential
  wget
  ca-certificates
  curl
  unzip
)
sudo apt-get update
sudo apt-get install -y "${base_packages[@]}"

## Install CUDA Driver 580
keyring_url='https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb'
# wget stores the download under the URL's final path component.
keyring_deb="${keyring_url##*/}"

wget -nv "${keyring_url}"
sudo dpkg -i "${keyring_deb}"
# Refresh the package lists again now that the keyring package is installed.
sudo apt-get update
sudo apt-get -y install nvidia-open-580
# The downloaded .deb is no longer needed once the driver is installed.
rm "${keyring_deb}"
17 changes: 14 additions & 3 deletions vm_images/linux-arm64/linux-arm64.pkr.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ packer {

locals {
ami_name_prefix = "xgboost-ci"
image_name = "RunsOn worker with Ubuntu 24.04 ARM64"
image_name = "RunsOn worker with Ubuntu 24.04 ARM64 + CUDA driver 580"
region = "us-west-2"
timestamp = regex_replace(timestamp(), "[- TZ:]", "")
volume_size = 40
Expand All @@ -33,7 +33,7 @@ source "amazon-ebs" "runs-on-linux-arm64" {
ami_virtualization_type = "hvm"
associate_public_ip_address = true
communicator = "ssh"
instance_type = "c6g.4xlarge"
instance_type = "g5g.xlarge"
region = "${local.region}"
ssh_timeout = "10m"
ssh_username = "ubuntu"
Expand Down Expand Up @@ -63,6 +63,17 @@ build {
sources = ["source.amazon-ebs.runs-on-linux-arm64"]

provisioner "shell" {
script = "bootstrap.sh"
script = "install_drivers.sh"
pause_after = "30s"
}

provisioner "shell" {
expect_disconnect = true
inline = ["echo 'Reboot VM'", "sudo reboot"]
}

provisioner "shell" {
pause_before = "1m0s"
script = "bootstrap.sh"
}
}
Loading