diff --git a/cloudbuild/gpu-image-builder/build_packer_builder.sh b/cloudbuild/gpu-image-builder/build_packer_builder.sh
new file mode 100644
index 0000000000..6b75fd3422
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/build_packer_builder.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# build_packer_builder.sh
+# Builds the custom Packer Cloud Build builder if it doesn't exist.
+# https://docs.cloud.google.com/build/docs/building/build-vm-images-with-packer
+#
+# Usage: build_packer_builder.sh PROJECT_ID
+
+set -xeuo pipefail
+
+# Abort with a usage hint, rather than a bare "unbound variable" error, when
+# the project ID argument is missing or empty.
+PROJECT_ID="${1:?Usage: build_packer_builder.sh PROJECT_ID}"
+PACKER_BUILDER_IMAGE="gcr.io/${PROJECT_ID}/packer"
+
+if gcloud container images describe "${PACKER_BUILDER_IMAGE}" > /dev/null 2>&1; then
+  echo "Packer builder image '${PACKER_BUILDER_IMAGE}' exists, skipping build."
+else
+  echo "Packer builder image not found. Building it now..."
+  # NOTE(review): the build is submitted from inside the cloned packer/
+  # directory; presumably gcloud reads the builder's cloudbuild.yaml from the
+  # current directory - confirm before dropping the clone.
+  git clone https://github.com/GoogleCloudPlatform/cloud-builders-community.git --depth=1
+  cd cloud-builders-community/packer
+  gcloud builds submit \
+    https://github.com/GoogleCloudPlatform/cloud-builders-community \
+    --git-source-revision=master \
+    --git-source-dir=./packer/ \
+    --project="${PROJECT_ID}" \
+    --service-account=projects/stackdriver-test-143416/serviceAccounts/build-and-test@stackdriver-test-143416.iam.gserviceaccount.com \
+    --gcs-log-dir=gs://cloud-built-otel-collector-buckets-test-logs
+  cd -
+  echo "Packer builder image built."
+fi
diff --git a/cloudbuild/gpu-image-builder/check_source_image.sh b/cloudbuild/gpu-image-builder/check_source_image.sh
new file mode 100644
index 0000000000..d8507350e8
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/check_source_image.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# check_source_image.sh
+# Checks if the latest public image is newer than the source of our last build and if we need a new build
+#
+# Usage: check_source_image.sh PROJECT_ID SOURCE_IMAGE_FAMILY SOURCE_IMAGE_PROJECT \
+#          TARGET_IMAGE_FAMILY LOUHI_TRIGGER_TYPE
+#
+# Writes "SKIP" or "RUN" to /workspace/build_status.txt. On "RUN" it also writes
+# the name of the latest public image to /workspace/new_source_image.txt.
+
+set -euo pipefail
+
+# Fail early with a clear message when a required argument is missing or empty.
+PROJECT_ID="${1:?missing PROJECT_ID}"
+SOURCE_IMAGE_FAMILY="${2:?missing SOURCE_IMAGE_FAMILY}"
+SOURCE_IMAGE_PROJECT="${3:?missing SOURCE_IMAGE_PROJECT}"
+TARGET_IMAGE_FAMILY="${4:?missing TARGET_IMAGE_FAMILY}"
+# Louhi sets the trigger type to either "cron-trigger" or "git-change-trigger".
+# An empty value is still accepted and is treated like a non-cron trigger.
+LOUHI_TRIGGER_TYPE="${5?missing LOUHI_TRIGGER_TYPE}"
+
+echo "--- Checking for New Source Image ---"
+LATEST_PUBLIC_IMAGE=$(gcloud compute images describe-from-family "${SOURCE_IMAGE_FAMILY}" --project="${SOURCE_IMAGE_PROJECT}" --format="value(name)")
+echo "Latest available public image: ${LATEST_PUBLIC_IMAGE}"
+
+# Stays empty when the target family does not exist yet (first build).
+LAST_CURATED_SOURCE_IMAGE=""
+if gcloud compute images describe-from-family "${TARGET_IMAGE_FAMILY}" --project="${PROJECT_ID}" &> /dev/null; then
+  LAST_CURATED_SOURCE_IMAGE=$(gcloud compute images describe-from-family "${TARGET_IMAGE_FAMILY}" --project="${PROJECT_ID}" --format="value(labels.source-gce-image)")
+  echo "Source image of our latest curated image: ${LAST_CURATED_SOURCE_IMAGE}"
+else
+  echo "Image family '${TARGET_IMAGE_FAMILY}' not found. Assuming this is the first build."
+fi
+
+# Only skip when running nightly, and there is no new base image
+if [[ "${LATEST_PUBLIC_IMAGE}" == "${LAST_CURATED_SOURCE_IMAGE}" ]] && \
+   [[ "${LOUHI_TRIGGER_TYPE}" == "cron-trigger" ]]; then
+  echo "Source image '${LATEST_PUBLIC_IMAGE}' has not changed. Signaling to skip build."
+  echo "SKIP" > /workspace/build_status.txt
+# Else, we either have a new image, or this is triggered by git changes
+# Note that we set the Louhi Git trigger to only watch cloudbuild/gpu-image-builder directory
+else
+  if [[ "${LATEST_PUBLIC_IMAGE}" != "${LAST_CURATED_SOURCE_IMAGE}" ]]; then
+    echo "New source image '${LATEST_PUBLIC_IMAGE}' detected or first run. Signaling to run build."
+  else
+    echo "New image building triggered by GitHub changes (Louhi trigger type = '${LOUHI_TRIGGER_TYPE}')"
+  fi
+  echo "${LATEST_PUBLIC_IMAGE}" > /workspace/new_source_image.txt
+  echo "RUN" > /workspace/build_status.txt
+fi
diff --git a/cloudbuild/gpu-image-builder/cloudbuild.yaml b/cloudbuild/gpu-image-builder/cloudbuild.yaml
new file mode 100644
index 0000000000..7a78ee3015
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/cloudbuild.yaml
@@ -0,0 +1,53 @@
+# cloudbuild.yaml
+steps:
+# Check for new source image. Runs 'check_source_image.sh'.
+# The scripts are added with mode 100644 (not executable), so each step marks its script executable first.
+- id: 'check-source-image'
+  name: 'gcr.io/cloud-builders/gcloud'
+  entrypoint: 'bash'
+  args:
+  - '-c'
+  - |
+    chmod +x /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/check_source_image.sh
+    /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/check_source_image.sh "${PROJECT_ID}" \
+      "${_LOUHI_PARAM_SOURCE_IMAGE_FAMILY}" \
+      "${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \
+      "${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \
+      "${_LOUHI_TRIGGER_TYPE}"
+  waitFor: ['-']
+
+# Conditionally build the Packer builder image. Runs 'build_packer_builder.sh'.
+- id: 'build-packer-builder'
+  name: 'gcr.io/cloud-builders/gcloud'
+  entrypoint: 'bash'
+  args:
+  - '-c'
+  - |
+    chmod +x /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/build_packer_builder.sh
+    /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/build_packer_builder.sh "${PROJECT_ID}"
+  waitFor: ['-'] # Can run in parallel with check-source-image
+
+# Run Packer to build the GCE image, but only if 'check-source-image' signaled to RUN.
+- id: 'packer-build-gpu-image'
+  name: 'gcr.io/${PROJECT_ID}/packer'
+  entrypoint: 'bash'
+  args:
+  - '-c'
+  - |
+    if [[ "$(cat /workspace/build_status.txt)" == "SKIP" ]]; then
+      echo "Skipping Packer build as source image has not changed."
+      exit 0
+    fi
+
+    /usr/bin/packer build \
+      -var "project_id=${PROJECT_ID}" \
+      -var "image_name=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}-$(date -u +%Y%m%d-%H%M%S)" \
+      -var "image_family=${_LOUHI_PARAM_OUTPUT_IMAGE_FAMILY}" \
+      -var "source_image=$(cat /workspace/new_source_image.txt)" \
+      -var "source_image_project=${_LOUHI_PARAM_SOURCE_IMAGE_PROJECT}" \
+      -var "zone=us-central1-a" \
+      -var "build_id=${BUILD_ID}" \
+      /workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/packer.pkr.hcl
+  waitFor: ['check-source-image', 'build-packer-builder']
+
+timeout: 14400s
diff --git a/cloudbuild/gpu-image-builder/packer.pkr.hcl b/cloudbuild/gpu-image-builder/packer.pkr.hcl
new file mode 100644
index 0000000000..3700846f9b
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/packer.pkr.hcl
@@ -0,0 +1,83 @@
+// packer.pkr.hcl
+variable "project_id" {
+  type        = string
+  description = "GCP Project ID"
+}
+
+variable "image_name" {
+  type        = string
+  description = "Name of the created GCE image"
+}
+
+variable "image_family" {
+  type        = string
+  description = "Image family for the created GCE image"
+}
+
+variable "source_image" {
+  type        = string
+  description = "The specific source GCE image name (e.g., ubuntu-2204-jammy-v20240115)"
+}
+
+variable "source_image_project" {
+  type        = string
+  description = "The specific source GCE image project (e.g., ubuntu-os-cloud)"
+}
+
+variable "zone" {
+  type        = string
+  default     = "us-central1-a"
+  description = "GCP zone for the temporary build instance"
+}
+
+variable "build_id" {
+  type        = string
+  description = "Cloud Build ID for traceability"
+  default     = "manual"
+}
+
+locals {
+  // Directory holding the per-image-family provisioning scripts. It sits under
+  // the same checkout root that cloudbuild.yaml uses to locate this template.
+  scripts_dir = "/workspace/louhi_ws/ops-agent/cloudbuild/gpu-image-builder/scripts"
+}
+
+source "googlecompute" "gpu_image" {
+  project_id              = var.project_id
+  zone                    = var.zone
+  source_image            = var.source_image
+  source_image_project_id = [var.source_image_project]
+  image_name              = var.image_name
+  image_family            = var.image_family
+  ssh_username            = "packer"
+  disk_size               = 50
+  disk_type               = "pd-standard"
+  machine_type            = "n1-standard-4" // Use a standard VM for building, no GPU needed here
+  tags                    = ["packer-build"]
+
+  // *** IMPORTANT: Label the created image with its source image ***
+  image_labels = {
+    source-gce-image = "${var.source_image}"
+    built-by         = "louhi"
+    cloud-build-id   = "${var.build_id}"
+  }
+}
+
+build {
+  sources = ["source.googlecompute.gpu_image"]
+
+  // Provisioner 1: Most distros only need one step
+  provisioner "shell" {
+    script            = "${local.scripts_dir}/${var.image_family}/setup_vm.sh"
+    expect_disconnect = true // Expect a disconnect/reboot after GPU driver install
+    timeout           = "240m"
+  }
+
+  // Provisioner 2: Handles the post-reboot part, ONLY for Debian 12.
+  provisioner "shell" {
+    script            = var.image_family == "debian-12" ? "${local.scripts_dir}/${var.image_family}/post_reboot.sh" : "${local.scripts_dir}/noop.sh"
+    pause_before      = "60s" // Wait for the reboot to be complete
+    expect_disconnect = false // No reboot expected in this second phase.
+    timeout           = "240m"
+  }
+}
diff --git a/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh
new file mode 100644
index 0000000000..5123ade43a
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/debian-11/setup_vm.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+# Source Image: ml-images:common-gpu-debian-11-py310
+# Source Image description: Google, Deep Learning VM with CUDA 11.8, M126, Debian 11, Python 3.10. With CUDA 11.8 preinstalled.
+# Output Image: stackdriver-test-143416:debian-11
+
+# DLVM images come with a script to install the driver and CUDA toolkit.
+# Run it with sudo: the Packer provisioner connects as the 'packer' SSH user, not root.
+sudo /opt/deeplearning/install-driver.sh
diff --git a/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh b/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh
new file mode 100644
index 0000000000..00a026ae69
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/debian-12/post_reboot.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# post_reboot.sh - Runs setup steps after the VM has rebooted on Debian 12. Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+INSTALLER_DIR="/var/lib/cuda-installer"
+CUDA_INSTALLER_PATH="${INSTALLER_DIR}/cuda_installer.pyz"
+
+# Rerun `install_driver` to finish driver installation
+sudo python3 "${CUDA_INSTALLER_PATH}" install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_driver failed!"; exit 1; }
+
+# Install CUDA toolkit
+sudo python3 "${CUDA_INSTALLER_PATH}" install_cuda --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_cuda failed!"; exit 1; }
diff --git a/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh
new file mode 100644
index 0000000000..5e75b56794
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/debian-12/setup_vm.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+# Source Image: debian-cloud:debian-12
+# Output Image: stackdriver-test-143416:debian-12
+
+sudo apt update -y
+sudo apt install -y --no-install-recommends python3 python3-pip wget curl gnupg git || { echo "ERROR: Failed to install prerequisites!"; exit 1; }
+
+INSTALLER_DIR="/var/lib/cuda-installer"
+CUDA_INSTALLER_PATH="${INSTALLER_DIR}/cuda_installer.pyz"
+sudo mkdir -p "${INSTALLER_DIR}"
+# --fail: abort on an HTTP error instead of saving the error page as the installer.
+sudo curl --fail -L https://storage.googleapis.com/compute-gpu-installation-us/installer/latest/cuda_installer.pyz --output "${CUDA_INSTALLER_PATH}"
+sudo chmod +x "${CUDA_INSTALLER_PATH}"
+
+sudo python3 "${CUDA_INSTALLER_PATH}" install_driver --ignore-no-gpu --installation-mode=repo --installation-branch=nfb || { echo "ERROR: cuda_installer.pyz install_driver failed!"; exit 1; }
+# The script will reboot
diff --git a/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh
new file mode 100644
index 0000000000..3a2632e756
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/debian-13/setup_vm.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+# Source Image: debian-cloud:debian-13
+# Output Image: stackdriver-test-143416:debian-13
+
+# Install driver and CUDA toolkit
+sudo apt update -y
+KERNEL_VERSION="$(uname -r)"
+sudo apt install -y "linux-headers-${KERNEL_VERSION}" pciutils gcc make dkms wget git
+
+wget https://developer.download.nvidia.com/compute/cuda/repos/debian13/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt update
+
+sudo apt -y install cuda-13-1
diff --git a/cloudbuild/gpu-image-builder/scripts/noop.sh b/cloudbuild/gpu-image-builder/scripts/noop.sh
new file mode 100644
index 0000000000..138e5ca3a8
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/noop.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+# Empty Script as a placeholder for noop steps
diff --git a/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh
new file mode 100644
index 0000000000..28cefc0cdb
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/rocky-linux-8/setup_vm.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+# Source Image: rocky-linux-accelerator-cloud:rocky-linux-8-optimized-gcp-nvidia-580
+# Source Image Description: Rocky Linux, Rocky Linux, 8 with the Nvidia 580 driver, x86_64 optimized for GCP built on {date}
+# Output Image: stackdriver-test-143416:rocky-linux-8
+
+# The accelerator image already has the driver (R580) installed.
+# Follow https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Rocky&target_version=8&target_type=rpm_network
+# to install the matching CUDA toolkit 13.0 (without driver)
+sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
+sudo dnf clean all
+sudo dnf -y install cuda-toolkit-13-0 git make
diff --git a/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh
new file mode 100644
index 0000000000..5c470d4b9e
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/rocky-linux-9/setup_vm.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+# Source Image: rocky-linux-accelerator-cloud:rocky-linux-9-optimized-gcp-nvidia-580
+# Source Image Description: Rocky Linux, Rocky Linux, 9 with the Nvidia 580 driver, x86_64 optimized for GCP with the Nvidia 580 driver built on {date}
+# Output Image: stackdriver-test-143416:rocky-linux-9
+
+# The accelerator image already has the driver (R580) installed.
+# Follow https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Rocky&target_version=9&target_type=rpm_network
+# to install the matching CUDA toolkit 13.0 (without driver)
+sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+sudo dnf clean all
+sudo dnf -y install cuda-toolkit-13-0 git make
diff --git a/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh
new file mode 100644
index 0000000000..0f106cbbdb
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/sles-15/setup_vm.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+# Source Image: suse-cloud:sles-15
+# Output Image: stackdriver-test-143416:sles-15
+
+# Mimic our prepareSLES() logic in gce_testing.go
+# https://github.com/GoogleCloudPlatform/opentelemetry-operations-collector/blob/ec757f2f48c865c7aa1afaed27891d8727a28f2e/integration_test/gce-testing-internal/gce/gce_testing.go#L1057
+#
+# Runs a command string until it succeeds or the attempts are exhausted.
+# Arguments:
+#   $1 - maximum number of attempts
+#   $2 - seconds to sleep between attempts
+#   $3 - command line, executed with `bash -c`
+# Returns 0 on the first success; exits the script with status 1 otherwise.
+retry_command() {
+  local max_attempts="$1"
+  local sleep_time="$2"
+  local cmd="$3"
+  local i
+
+  echo "Starting command: $cmd"
+  echo "----------------------------------------"
+
+  for ((i=1; i<=max_attempts; i++)); do
+    echo "[Attempt $i/$max_attempts] Running..."
+
+    # Run the command using bash -c to handle complex commands (like those with &&)
+    if bash -c "$cmd"; then
+      echo "----------------------------------------"
+      echo "Success!"
+      return 0
+    fi
+
+    echo "Attempt failed."
+
+    # Sleep only if we have attempts left
+    if (( i < max_attempts )); then
+      echo "Waiting $sleep_time seconds before retrying..."
+      sleep "$sleep_time"
+    fi
+  done
+
+  echo "----------------------------------------"
+  echo "Error: Command failed after $max_attempts attempts."
+  exit 1
+}
+
+retry_command 5 5 "sudo /usr/sbin/registercloudguest --force"
+retry_command 120 5 "sudo zypper --non-interactive --gpg-auto-import-keys refresh && sudo zypper --non-interactive install --force coreutils"
+
+sudo zypper --non-interactive install -y "kernel-default-devel=$(uname -r | sed 's/\-default//')" pciutils gcc make wget git
+
+# Install CUDA and driver together, since the `exercise` script needs to run a
+# CUDA app to generate GPU process metrics
+# Prefer to install from the package manager since it is normally faster and has
+# fewer errors on installation. The cuda-12-9 mega-package installs driver and
+# CUDA together
+sudo zypper --non-interactive addrepo https://developer.download.nvidia.com/compute/cuda/repos/sles15/x86_64/cuda-sles15.repo
+sudo zypper --gpg-auto-import-keys --non-interactive refresh
+# CUDA 13 is not yet working with the SLES 15 image
+sudo zypper --non-interactive install -y nvidia-compute-utils-G06
+sudo zypper --non-interactive install -y cuda-12-9
diff --git a/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh
new file mode 100644
index 0000000000..7719319a07
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/ubuntu-2204-lts/setup_vm.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+# Source Image: ubuntu-os-accelerator-images:ubuntu-accelerator-2204-amd64-with-nvidia-580
+# Source Image Description: Canonical, Ubuntu, 22.04 LTS NVIDIA version: 580, amd64 jammy image built on {date}
+# Output Image: stackdriver-test-143416:ubuntu-2204-lts
+
+# The accelerator image already has the driver (R580) installed.
+# Follow https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network
+# to install the matching CUDA toolkit 13.0 (without driver)
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install build-essential cuda-toolkit-13-0
diff --git a/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh b/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh
new file mode 100644
index 0000000000..2ea45ce29c
--- /dev/null
+++ b/cloudbuild/gpu-image-builder/scripts/ubuntu-2404-lts-amd64/setup_vm.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# setup_vm.sh - Provisioning script for Packer, executed via Shell Provisioner.
+set -euo pipefail
+
+# Source Image: ubuntu-os-accelerator-images:ubuntu-accelerator-2404-amd64-with-nvidia-580
+# Source Image Description: Canonical, Ubuntu, 24.04 LTS NVIDIA version: 580, amd64 noble image built on {date}
+# Output Image: stackdriver-test-143416:ubuntu-2404-lts-amd64
+
+# The accelerator image already has the driver (R580) installed.
+# Follow https://developer.nvidia.com/cuda-13-0-0-download-archive?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=24.04&target_type=deb_network
+# to install the matching CUDA toolkit 13.0 (without driver)
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install build-essential cuda-toolkit-13-0