diff --git a/integration_test/agents/agents.go b/integration_test/agents/agents.go index 3afb097063..7b0d921923 100644 --- a/integration_test/agents/agents.go +++ b/integration_test/agents/agents.go @@ -516,7 +516,8 @@ func IsRPMBased(imageSpec string) bool { strings.HasPrefix(imageSpec, "suse-cloud") || strings.HasPrefix(imageSpec, "suse-sap-cloud") || strings.HasPrefix(imageSpec, "opensuse-cloud") || - strings.Contains(imageSpec, "sles-") + strings.Contains(imageSpec, "sles-") || + strings.Contains(imageSpec, "rocky-linux-") } // StripTildeSuffix strips off everything after the first ~ character. We see diff --git a/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install b/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install index 3e37e0c2c4..f6629104f8 100644 --- a/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/dcgm/centos_rhel/install @@ -1,133 +1,9 @@ set -e -source /etc/os-release -MAJOR_VERSION_ID=${VERSION_ID%%.*} -verify_driver() { - # Verify NVIDIA driver: - # Installation could finish successfully but the driver is still unusable - # A common error when running this check: - # "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA - # driver. Make sure that the latest NVIDIA driver is installed and running." - nvidia-smi -} +# DCGM and CUDA toolkit share the same NVIDIA repo, which has already been configured in the image +# Install DCGM +sudo yum install -y datacenter-gpu-manager +sudo systemctl --now enable nvidia-dcgm -install_cuda_from_runfile() { - # Ref: https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#runfile - # This method requires the matching kernel-devel package to be installed, and - # the package may be absent from the repo and cause this method to fail - # Remove existing installation before using the runfile - remove_cuda_package - remove_driver_package - # For Rocky Linux 9: when a new OS version becomes available, the default - # repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the - # new version's repo. This is problematic since the new OS is not available - # right away on GCE. Set up the matched repo to install the correct - # kernel-devel-$(uname -r) - # Not needed for RL8 since 8.10 is already the last RL8 release. - REPO_URL="https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/x86_64/os/" - REPO_METADATA="$REPO_URL/repodata/repomd.xml" - STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$REPO_METADATA") - if [[ $ID == rocky && "$MAJOR_VERSION_ID" == 9 && "$STATUS_CODE" == "200" ]]; then - cat </dev/null; then + sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list + sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list.d/*.list + fi +fi -# check NVIDIA driver installation succeeded -nvidia-smi +if ! dpkg -s cuda-keyring >/dev/null 2>&1; then + filename="cuda-keyring_1.1-1_all.deb" + url="https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/${filename}" + + wget --no-verbose "$url" + sudo dpkg -i "$filename" +fi # Install DCGM sudo apt-get update diff --git a/integration_test/third_party_apps_test/applications/dcgm/exercise b/integration_test/third_party_apps_test/applications/dcgm/exercise index 22604e508c..42189817a5 100644 --- a/integration_test/third_party_apps_test/applications/dcgm/exercise +++ b/integration_test/third_party_apps_test/applications/dcgm/exercise @@ -1,6 +1,18 @@ set -e -# Run the bandwidthTest demo with a large range to create a process that uses +# Run the gpu burn with a large range to create a process that uses # GPU for a period that is longer than default collection interval of 60s -/usr/local/cuda/extras/demo_suite/bandwidthTest --memory=pinned --mode=range \ - --start=1024 --end=20480 --increment=1 +git clone https://github.com/wilicc/gpu-burn +cd gpu-burn +DEVICE_CODE=$(lspci -n | grep -Po '10de:[\w\d]{4}') +case $DEVICE_CODE in + # V100 | P4 | P100 + # Device PCIe ID lookup: https://envytools.readthedocs.io/en/latest/hw/pciid.html + 10de:1db1|10de:1bb3|10de:15f8) + make COMPUTE=60 + ;; + *) + make + ;; +esac +./gpu_burn -d 180 diff --git a/integration_test/third_party_apps_test/applications/dcgm/metadata.yaml b/integration_test/third_party_apps_test/applications/dcgm/metadata.yaml index 37c9f2028a..705b0f2952 100644 --- a/integration_test/third_party_apps_test/applications/dcgm/metadata.yaml +++ b/integration_test/third_party_apps_test/applications/dcgm/metadata.yaml @@ -23,28 +23,31 @@ configure_integration: |- You must install DCGM and run the DCGM daemon service. supported_operating_systems: linux supported_app_version: ["3.1"] -gpu_platforms: # p4, p100 don't emit DCGM profiling metrics +gpu_platforms: + # Platform selection: one most common distro (Ubuntu/Debian) for all GPU models + one easy-to-access GPU model (L4) for all distros +# Debian 11 has the proprietary version of the driver that supports older GPUs (V100, P100, P4) +# P4, P100 don't emit DCGM profiling metrics - model: a100 platforms: - ubuntu-os-cloud:ubuntu-2204-lts - model: v100 platforms: - - ubuntu-os-cloud:ubuntu-2204-lts + - debian-cloud:debian-11 - model: t4 platforms: - ubuntu-os-cloud:ubuntu-2204-lts - model: l4 platforms: - debian-cloud:debian-11 - - ml-images:common-gpu-debian-11-py310 + - debian-cloud:debian-12 + # DCGM 3 not available on debian-cloud:debian-13 - rocky-linux-cloud:rocky-linux-8 - rocky-linux-cloud:rocky-linux-9 - suse-cloud:sles-15 - ubuntu-os-cloud:ubuntu-2204-lts - ubuntu-os-cloud:ubuntu-2404-lts-amd64 - model: h100 - platforms: - - ubuntu-os-cloud:ubuntu-minimal-2204-lts # due to H100 quota, choose an image from the exhaustive list to skip presubmits + platforms: [] # Need quota expected_metrics: - type: workload.googleapis.com/gpu.dcgm.utilization value_type: DOUBLE diff --git a/integration_test/third_party_apps_test/applications/dcgm/sles/install b/integration_test/third_party_apps_test/applications/dcgm/sles/install index f30fe6459d..6df2671af4 100644 --- a/integration_test/third_party_apps_test/applications/dcgm/sles/install +++ b/integration_test/third_party_apps_test/applications/dcgm/sles/install @@ -1,22 +1,6 @@ set -e -sudo zypper --non-interactive install -y kernel-default-devel=$(uname -r | sed 's/\-default//') pciutils gcc make wget - -# Install CUDA and driver the same way as the nvml app -# Prefer to install from the package manager since it is normally faster and has -# less errors on installation; fallback to the runfile method if the package -# manager's package is not working or not compitible with the GPU model -DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//') -# Need to add the repo for installing CUDA and DCGM -sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo -sudo zypper --gpg-auto-import-keys --non-interactive refresh -echo "Installing latest version of NVIDIA CUDA and driver" -sudo zypper --non-interactive install -y nvidia-compute-utils-G06 -sudo zypper --non-interactive install -y cuda-12-9 - -# check NVIDIA driver installation succeeded -nvidia-smi - +# DCGM and CUDA toolkit share the same NVIDIA repo, which has already been configured in the image # Install DCGM sudo zypper --non-interactive install datacenter-gpu-manager sudo systemctl --now enable nvidia-dcgm diff --git a/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install b/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install index f52350ae84..f6629104f8 100644 --- a/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/dcgmv1/centos_rhel/install @@ -1,135 +1,9 @@ set -e -source /etc/os-release -MAJOR_VERSION_ID=${VERSION_ID%%.*} -verify_driver() { - # Verify NVIDIA driver: - # Installation could finish successfully but the driver is still unusable - # A common error when running this check: - # "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA - # driver. Make sure that the latest NVIDIA driver is installed and running." - nvidia-smi -} +# DCGM and CUDA toolkit share the same NVIDIA repo, which has already been configured in the image +# Install DCGM +sudo yum install -y datacenter-gpu-manager +sudo systemctl --now enable nvidia-dcgm -install_cuda_from_runfile() { - # Ref: https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#runfile - # This method requires the matching kernel-devel package to be installed, and - # the package may be absent from the repo and cause this method to fail - # Remove existing installation before using the runfile - remove_cuda_package - remove_driver_package - # For Rocky Linux 9: when a new OS version becomes available, the default - # repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the - # new version's repo. This is problematic since the new OS is not available - # right away on GCE. Set up the matched repo to install the correct - # kernel-devel-$(uname -r) - # Not needed for RL8 since 8.10 is already the last RL8 release. - REPO_URL="https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/x86_64/os/" - REPO_METADATA="$REPO_URL/repodata/repomd.xml" - STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$REPO_METADATA") - if [[ $ID == rocky && "$MAJOR_VERSION_ID" == 9 && "$STATUS_CODE" == "200" ]]; then - cat </dev/null; then + sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list + sudo sed -i '/bullseye-backports/s/^/#/' /etc/apt/sources.list.d/*.list + fi +fi -# check NVIDIA driver installation succeeded -nvidia-smi +if ! dpkg -s cuda-keyring >/dev/null 2>&1; then + filename="cuda-keyring_1.1-1_all.deb" + url="https://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/${filename}" + + wget --no-verbose "$url" + sudo dpkg -i "$filename" +fi # Install DCGM sudo apt-get update diff --git a/integration_test/third_party_apps_test/applications/dcgmv1/metadata.yaml b/integration_test/third_party_apps_test/applications/dcgmv1/metadata.yaml index c20f4cdd4e..ef407271c1 100644 --- a/integration_test/third_party_apps_test/applications/dcgmv1/metadata.yaml +++ b/integration_test/third_party_apps_test/applications/dcgmv1/metadata.yaml @@ -23,28 +23,31 @@ configure_integration: |- You must install DCGM and run the DCGM daemon service. supported_operating_systems: linux supported_app_version: ["3.1"] -gpu_platforms: # p4, p100 don't emit DCGM profiling metrics +gpu_platforms: +# Platform selection: one most common distro (Ubuntu/Debian) for all GPU models + one easy-to-access GPU model (L4) for all distros +# Debian 11 has the proprietary version of the driver that supports older GPUs (V100, P100, P4) +# P4, P100 don't emit DCGM profiling metrics - model: a100 platforms: - ubuntu-os-cloud:ubuntu-2204-lts - model: v100 platforms: - - ubuntu-os-cloud:ubuntu-2204-lts + - debian-cloud:debian-11 - model: t4 platforms: - ubuntu-os-cloud:ubuntu-2204-lts - model: l4 platforms: - debian-cloud:debian-11 - - ml-images:common-gpu-debian-11-py310 + - debian-cloud:debian-12 + # DCGM 3 not available on debian-cloud:debian-13 - rocky-linux-cloud:rocky-linux-8 - rocky-linux-cloud:rocky-linux-9 - suse-cloud:sles-15 - ubuntu-os-cloud:ubuntu-2204-lts - ubuntu-os-cloud:ubuntu-2404-lts-amd64 - model: h100 - platforms: - - ubuntu-os-cloud:ubuntu-minimal-2204-lts # due to H100 quota, choose an image from the exhaustive list to skip presubmits + platforms: [] # Need quota expected_metrics: - type: workload.googleapis.com/dcgm.gpu.profiling.sm_utilization value_type: DOUBLE diff --git a/integration_test/third_party_apps_test/applications/dcgmv1/sles/install b/integration_test/third_party_apps_test/applications/dcgmv1/sles/install index f30fe6459d..6df2671af4 100644 --- a/integration_test/third_party_apps_test/applications/dcgmv1/sles/install +++ b/integration_test/third_party_apps_test/applications/dcgmv1/sles/install @@ -1,22 +1,6 @@ set -e -sudo zypper --non-interactive install -y kernel-default-devel=$(uname -r | sed 's/\-default//') pciutils gcc make wget - -# Install CUDA and driver the same way as the nvml app -# Prefer to install from the package manager since it is normally faster and has -# less errors on installation; fallback to the runfile method if the package -# manager's package is not working or not compitible with the GPU model -DISTRIBUTION=$(. /etc/os-release;echo $ID$VERSION_ID | sed -e 's/\.[0-9]//') -# Need to add the repo for installing CUDA and DCGM -sudo zypper --non-interactive ar http://developer.download.nvidia.com/compute/cuda/repos/${DISTRIBUTION}/x86_64/cuda-${DISTRIBUTION}.repo -sudo zypper --gpg-auto-import-keys --non-interactive refresh -echo "Installing latest version of NVIDIA CUDA and driver" -sudo zypper --non-interactive install -y nvidia-compute-utils-G06 -sudo zypper --non-interactive install -y cuda-12-9 - -# check NVIDIA driver installation succeeded -nvidia-smi - +# DCGM and CUDA toolkit share the same NVIDIA repo, which has already been configured in the image # Install DCGM sudo zypper --non-interactive install datacenter-gpu-manager sudo systemctl --now enable nvidia-dcgm diff --git a/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install b/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install index 25e47ab038..ef8e4094d2 100644 --- a/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install +++ b/integration_test/third_party_apps_test/applications/nvml/centos_rhel/install @@ -1,123 +1,3 @@ set -e -source /etc/os-release -MAJOR_VERSION_ID=${VERSION_ID%%.*} - -verify_driver() { - # Verify NVIDIA driver: - # Installation could finish successfully but the driver is still unusable - # A common error when running this check: - # "NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA - # driver. Make sure that the latest NVIDIA driver is installed and running." - nvidia-smi -} - -install_cuda_from_runfile() { - # Ref: https://docs.nvidia.com/datacenter/tesla/tesla-installation-notes/index.html#runfile - # This method requires the matching kernel-devel package to be installed, and - # the package may be absent from the repo and cause this method to fail - # Remove existing installation before using the runfile - remove_cuda_package - remove_driver_package - # For Rocky Linux 9: when a new OS version becomes available, the default - # repo setting (/etc/yum.repos.d/rocky.repo) will automatically point to the - # new version's repo. This is problematic since the new OS is not available - # right away on GCE. Set up the matched repo to install the correct - # kernel-devel-$(uname -r) - # Not needed for RL8 since 8.10 is already the last RL8 release. - REPO_URL="https://dl.rockylinux.org/vault/rocky/$VERSION_ID/AppStream/x86_64/os/" - REPO_METADATA="$REPO_URL/repodata/repomd.xml" - STATUS_CODE=$(curl -s -o /dev/null -w "%{http_code}" "$REPO_METADATA") - if [[ $ID == rocky && "$MAJOR_VERSION_ID" == 9 && "$STATUS_CODE" == "200" ]]; then - cat <