diff --git a/.github/workflows/containers.yml b/.github/workflows/containers.yml
index 5803784..532eb8e 100644
--- a/.github/workflows/containers.yml
+++ b/.github/workflows/containers.yml
@@ -61,6 +61,8 @@ jobs:
            runner: linux-arm64-cpu
          - image_repo: xgb-ci.manylinux_2_28_aarch64
            runner: linux-arm64-cpu
+         - image_repo: xgb-ci.gpu_build_cuda13_rockylinux8_aarch64
+           runner: linux-arm64-cpu
     steps:
       - name: Workflow trigger information
         run: |
diff --git a/containers/ci_container.yml b/containers/ci_container.yml
index 1b68c5d..4403986 100644
--- a/containers/ci_container.yml
+++ b/containers/ci_container.yml
@@ -22,6 +22,14 @@ xgb-ci.gpu_build_cuda13_rockylinux8:
   build_args:
     CUDA_VERSION: "13.0.0"
     NCCL_VERSION: *nccl_version
+    ARCH: x86_64
+
+xgb-ci.gpu_build_cuda13_rockylinux8_aarch64:
+  container_def: gpu_build_cuda13_rockylinux8
+  build_args:
+    CUDA_VERSION: "13.0.0"
+    NCCL_VERSION: *nccl_version
+    ARCH: aarch64

 xgb-ci.gpu_build_r_rockylinux8:
   container_def: gpu_build_r_rockylinux8
diff --git a/containers/dockerfile/Dockerfile.gpu_build_cuda13_rockylinux8 b/containers/dockerfile/Dockerfile.gpu_build_cuda13_rockylinux8
index 626db05..11e190e 100644
--- a/containers/dockerfile/Dockerfile.gpu_build_cuda13_rockylinux8
+++ b/containers/dockerfile/Dockerfile.gpu_build_cuda13_rockylinux8
@@ -2,7 +2,8 @@ ARG CUDA_VERSION=notset
 FROM nvcr.io/nvidia/cuda:$CUDA_VERSION-devel-rockylinux8
 ARG CUDA_VERSION
 ARG NCCL_VERSION
-ARG MINIFORGE_VERSION=25.3.1-0
+ARG ARCH=x86_64
+ARG MINIFORGE_VERSION=25.11.0-1
 ARG CMAKE_VERSION=4.1.0

 SHELL ["/bin/bash", "-c"]
@@ -16,30 +17,33 @@ ENV GOSU_VERSION=1.10

 # Install all basic requirements
 RUN \
-    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
+    { [ $ARCH = "aarch64" ] && export CUDA_REPO_ARCH="sbsa" || export CUDA_REPO_ARCH="x86_64"; } && \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${CUDA_REPO_ARCH}/D42D0685.pub | sed '/^Version/d' \
         > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
     dnf -y update && \
     dnf -y install dnf-plugins-core && \
     dnf config-manager --set-enabled powertools && \
     dnf install -y tar unzip wget xz git which ninja-build gcc-toolset-10-gcc gcc-toolset-10-binutils gcc-toolset-10-gcc-c++ && \
     # Miniforge
-    wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/$MINIFORGE_VERSION/Miniforge3-$MINIFORGE_VERSION-Linux-x86_64.sh && \
+    wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/$MINIFORGE_VERSION/Miniforge3-$MINIFORGE_VERSION-Linux-${ARCH}.sh && \
     bash conda.sh -b -p /opt/miniforge && \
     /opt/miniforge/bin/python -m pip install awscli && \
     # CMake
-    wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.sh && \
+    wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${ARCH}.sh && \
     bash cmake.sh --skip-license --prefix=/usr

 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
+    { [ $ARCH = "aarch64" ] && export CUDA_REPO_ARCH="sbsa" || export CUDA_REPO_ARCH="x86_64"; } && \
     export NCCL_VERSION=$NCCL_VERSION && \
-    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
+    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/${CUDA_REPO_ARCH}/cuda-rhel8.repo && \
     dnf -y update && \
     dnf install -y libnccl-${NCCL_VERSION}+cuda13.0 libnccl-devel-${NCCL_VERSION}+cuda13.0 libnccl-static-${NCCL_VERSION}+cuda13.0

 # Install lightweight sudo (not bound to TTY)
 RUN set -ex; \
-    wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \
+    { [ $ARCH = "aarch64" ] && export GOSU_ARCH="arm64" || export GOSU_ARCH="amd64"; } && \
+    wget -nv -nc -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-${GOSU_ARCH}" && \
     chmod +x /usr/local/bin/gosu && \
     gosu nobody true

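To sanity-check the new ARCH plumbing, the aarch64 variant can be built locally. A minimal sketch, assuming an arm64 Docker host; the image tag, the build context path, and the NCCL version value are illustrative, not part of this patch (CI resolves NCCL_VERSION from the *nccl_version anchor in ci_container.yml):

    # Sketch: local build of the new aarch64 image; tag/context/version are placeholders
    docker build \
      --build-arg CUDA_VERSION=13.0.0 \
      --build-arg NCCL_VERSION=<nccl_version> \
      --build-arg ARCH=aarch64 \
      -f containers/dockerfile/Dockerfile.gpu_build_cuda13_rockylinux8 \
      -t xgb-ci.gpu_build_cuda13_rockylinux8_aarch64 \
      containers/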
diff --git a/containers/dockerfile/Dockerfile.gpu_build_r_rockylinux8 b/containers/dockerfile/Dockerfile.gpu_build_r_rockylinux8
index 8fe45d8..cf8ee59 100644
--- a/containers/dockerfile/Dockerfile.gpu_build_r_rockylinux8
+++ b/containers/dockerfile/Dockerfile.gpu_build_r_rockylinux8
@@ -18,7 +18,7 @@ ENV GOSU_VERSION=1.10

 # Install all basic requirements
 RUN \
-    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub | sed '/^Version/d' \
         > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
     dnf -y update && \
     dnf -y install dnf-plugins-core && \
diff --git a/containers/dockerfile/Dockerfile.gpu_build_rockylinux8 b/containers/dockerfile/Dockerfile.gpu_build_rockylinux8
index fbc5819..4064691 100644
--- a/containers/dockerfile/Dockerfile.gpu_build_rockylinux8
+++ b/containers/dockerfile/Dockerfile.gpu_build_rockylinux8
@@ -17,7 +17,7 @@ ENV GOSU_VERSION=1.10

 # Install all basic requirements
 RUN \
-    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/D42D0685.pub | sed '/^Version/d' \
+    curl -fsSL https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/D42D0685.pub | sed '/^Version/d' \
         > /etc/pki/rpm-gpg/RPM-GPG-KEY-NVIDIA && \
     dnf -y update && \
     dnf -y install dnf-plugins-core && \
@@ -34,7 +34,7 @@ RUN \

 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
     export NCCL_VERSION=$NCCL_VERSION && \
-    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
+    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
     dnf -y update && \
     dnf install -y libnccl-${NCCL_VERSION}+cuda12.9 libnccl-devel-${NCCL_VERSION}+cuda12.9
diff --git a/containers/dockerfile/Dockerfile.jvm_gpu_build b/containers/dockerfile/Dockerfile.jvm_gpu_build
index 324c253..46544fc 100644
--- a/containers/dockerfile/Dockerfile.jvm_gpu_build
+++ b/containers/dockerfile/Dockerfile.jvm_gpu_build
@@ -36,7 +36,7 @@ RUN \

 # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
     export NCCL_VERSION=$NCCL_VERSION && \
-    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
+    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && \
     dnf -y update && \
     dnf install -y libnccl-${NCCL_VERSION}+cuda12.9 libnccl-devel-${NCCL_VERSION}+cuda12.9 libnccl-static-${NCCL_VERSION}+cuda12.9
diff --git a/vm_images/linux-arm64/bootstrap.sh b/vm_images/linux-arm64/bootstrap.sh
index 061dce4..9a222e4 100644
--- a/vm_images/linux-arm64/bootstrap.sh
+++ b/vm_images/linux-arm64/bootstrap.sh
@@ -31,6 +31,19 @@ sudo systemctl is-active --quiet docker.service || sudo systemctl start docker.service
 sudo systemctl is-enabled --quiet docker.service || sudo systemctl enable docker.service
 sleep 10  # Docker daemon takes time to come up after installing
 sudo docker info
+
+## Install NVIDIA Container Toolkit
+curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+  && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+    sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+    sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+sudo apt-get update
+sudo apt-get install -y nvidia-container-toolkit
+sudo nvidia-ctk runtime configure --runtime=docker
+sudo systemctl restart docker
+
+sleep 10
+sudo docker run --rm --gpus all ubuntu nvidia-smi
 sudo systemctl stop docker

 ## Install AWS CLI v2
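The `nvidia-smi` smoke test above already exercises GPU passthrough during the image bake. If it fails, a couple of optional checks can help narrow things down; a sketch, not part of this patch (`nvidia-ctk runtime configure --runtime=docker` registers the runtime in /etc/docker/daemon.json):

    # Confirm the NVIDIA runtime is registered with the Docker daemon
    sudo docker info --format '{{json .Runtimes}}'   # expect a "nvidia" entry
    # Inspect the config file that nvidia-ctk edited
    cat /etc/docker/daemon.json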
diff --git a/vm_images/linux-arm64/install_drivers.sh b/vm_images/linux-arm64/install_drivers.sh
new file mode 100644
index 0000000..5d66199
--- /dev/null
+++ b/vm_images/linux-arm64/install_drivers.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -euo pipefail
+
+## Install basic tools
+echo 'debconf debconf/frontend select Noninteractive' | sudo debconf-set-selections
+sudo apt-get update
+sudo apt-get install -y cmake git build-essential wget ca-certificates curl unzip
+
+## Install CUDA Driver 580
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install nvidia-open-580
+rm cuda-keyring_1.1-1_all.deb
diff --git a/vm_images/linux-arm64/linux-arm64.pkr.hcl b/vm_images/linux-arm64/linux-arm64.pkr.hcl
index dc4ec6f..d34cf6b 100644
--- a/vm_images/linux-arm64/linux-arm64.pkr.hcl
+++ b/vm_images/linux-arm64/linux-arm64.pkr.hcl
@@ -9,7 +9,7 @@ packer {

 locals {
   ami_name_prefix = "xgboost-ci"
-  image_name      = "RunsOn worker with Ubuntu 24.04 ARM64"
+  image_name      = "RunsOn worker with Ubuntu 24.04 ARM64 + CUDA driver 580"
   region          = "us-west-2"
   timestamp       = regex_replace(timestamp(), "[- TZ:]", "")
   volume_size     = 40
@@ -33,7 +33,7 @@ source "amazon-ebs" "runs-on-linux-arm64" {
   ami_virtualization_type     = "hvm"
   associate_public_ip_address = true
   communicator                = "ssh"
-  instance_type               = "c6g.4xlarge"
+  instance_type               = "g5g.xlarge"
   region                      = "${local.region}"
   ssh_timeout                 = "10m"
   ssh_username                = "ubuntu"
@@ -63,6 +63,17 @@ build {
   sources = ["source.amazon-ebs.runs-on-linux-arm64"]

   provisioner "shell" {
-    script = "bootstrap.sh"
+    script      = "install_drivers.sh"
+    pause_after = "30s"
+  }
+
+  provisioner "shell" {
+    expect_disconnect = true
+    inline            = ["echo 'Reboot VM'", "sudo reboot"]
+  }
+
+  provisioner "shell" {
+    pause_before = "1m0s"
+    script       = "bootstrap.sh"
   }
 }
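The reboot between install_drivers.sh and bootstrap.sh lets the freshly installed nvidia-open-580 driver load before the GPU smoke test runs. To rebuild the AMI with the new sequence, roughly the following should work; a sketch assuming Packer is installed and AWS credentials for us-west-2 are configured:

    cd vm_images/linux-arm64
    packer init .                        # install required plugins
    packer validate linux-arm64.pkr.hcl  # check the template
    packer build linux-arm64.pkr.hcl     # bake the AMI on a g5g.xlarge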